This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 6519f8e7fd Minor: improve ParquetOpener docs (#12456)
6519f8e7fd is described below
commit 6519f8e7fd9e9b0d9be849f7914fe8b26fcd99b8
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Sep 15 07:59:42 2024 -0400
Minor: improve ParquetOpener docs (#12456)
---
.../src/datasource/physical_plan/parquet/opener.rs | 20 ++++++++++++++++++++
datafusion/core/src/datasource/schema_adapter.rs | 15 ++++++++-------
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
index a12e60eb41..2a198c3d45 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
@@ -41,21 +41,41 @@ use std::sync::Arc;
/// Implements [`FileOpener`] for a parquet file
pub(super) struct ParquetOpener {
+ /// Execution partition index
pub partition_index: usize,
+ /// Column indexes in `table_schema` needed by the query
pub projection: Arc<[usize]>,
+ /// Target number of rows in each output RecordBatch
pub batch_size: usize,
+ /// Optional limit on the number of rows to read
pub limit: Option<usize>,
+ /// Optional predicate to apply during the scan
pub predicate: Option<Arc<dyn PhysicalExpr>>,
+ /// Optional pruning predicate applied to row group statistics
pub pruning_predicate: Option<Arc<PruningPredicate>>,
+ /// Optional pruning predicate applied to data page statistics
pub page_pruning_predicate: Option<Arc<PagePruningAccessPlanFilter>>,
+ /// Schema of the output table
pub table_schema: SchemaRef,
+ /// Optional hint for how large the initial request to read parquet metadata
+ /// should be
pub metadata_size_hint: Option<usize>,
+ /// Metrics for reporting
pub metrics: ExecutionPlanMetricsSet,
+ /// Factory for instantiating parquet reader
pub parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+ /// Should the filters be evaluated during the parquet scan using
+ /// [`DataFusionArrowPredicate`](row_filter::DatafusionArrowPredicate)?
pub pushdown_filters: bool,
+ /// Should the filters be reordered to optimize the scan?
pub reorder_filters: bool,
+ /// Should the page index be read from parquet files, if present, to skip
+ /// data pages
pub enable_page_index: bool,
+ /// Should the bloom filter be read from parquet, if present, to skip row
+ /// groups
pub enable_bloom_filter: bool,
+ /// Schema adapter factory
pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
}
diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs
index 5d2d0ff91b..de508f2c34 100644
--- a/datafusion/core/src/datasource/schema_adapter.rs
+++ b/datafusion/core/src/datasource/schema_adapter.rs
@@ -73,17 +73,18 @@ pub trait SchemaAdapter: Send + Sync {
) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
}
-/// Creates a `SchemaMapping` that can be used to cast or map the columns
-/// from the file schema to the table schema.
+/// Maps, by casting or reordering columns from the file schema to the table
+/// schema.
pub trait SchemaMapper: Debug + Send + Sync {
- /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions.
+ /// Adapts a `RecordBatch` to match the `table_schema` using the stored
+ /// mapping and conversions.
fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
- /// Adapts a [`RecordBatch`] that does not have all the columns from the
+ /// Adapts a [`RecordBatch`] that does not have all the columns from the
/// file schema.
///
- /// This method is used when applying a filter to a subset of the columns during
- /// an `ArrowPredicate`.
+ /// This method is used when applying a filter to a subset of the columns as
+ /// part of `DataFusionArrowPredicate` when `filter_pushdown` is enabled.
///
/// This method is slower than `map_batch` as it looks up columns by name.
fn map_partial_batch(
@@ -92,7 +93,7 @@ pub trait SchemaMapper: Debug + Send + Sync {
) -> datafusion_common::Result<RecordBatch>;
}
-/// Basic implementation of [`SchemaAdapterFactory`] that maps columns by name
+/// Implementation of [`SchemaAdapterFactory`] that maps columns by name
/// and casts columns to the expected type.
#[derive(Clone, Debug, Default)]
pub struct DefaultSchemaAdapterFactory {}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]