This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 6519f8e7fd Minor: improve ParquetOpener docs (#12456)
6519f8e7fd is described below
commit 6519f8e7fd9e9b0d9be849f7914fe8b26fcd99b8
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Sep 15 07:59:42 2024 -0400
Minor: improve ParquetOpener docs (#12456)
---
.../src/datasource/physical_plan/parquet/opener.rs | 20 ++++++++++++++++++++
datafusion/core/src/datasource/schema_adapter.rs | 15 ++++++++-------
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
index a12e60eb41..2a198c3d45 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
@@ -41,21 +41,41 @@ use std::sync::Arc;
/// Implements [`FileOpener`] for a parquet file
pub(super) struct ParquetOpener {
+ /// Execution partition index
pub partition_index: usize,
+ /// Column indexes in `table_schema` needed by the query
pub projection: Arc<[usize]>,
+ /// Target number of rows in each output RecordBatch
pub batch_size: usize,
+ /// Optional limit on the number of rows to read
pub limit: Option<usize>,
+ /// Optional predicate to apply during the scan
pub predicate: Option<Arc<dyn PhysicalExpr>>,
+ /// Optional pruning predicate applied to row group statistics
pub pruning_predicate: Option<Arc<PruningPredicate>>,
+ /// Optional pruning predicate applied to data page statistics
pub page_pruning_predicate: Option<Arc<PagePruningAccessPlanFilter>>,
+ /// Schema of the output table
pub table_schema: SchemaRef,
+ /// Optional hint for how large the initial request to read parquet metadata
+ /// should be
pub metadata_size_hint: Option<usize>,
+ /// Metrics for reporting
pub metrics: ExecutionPlanMetricsSet,
+ /// Factory for instantiating parquet reader
pub parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+ /// Should the filters be evaluated during the parquet scan using
+ /// [`DataFusionArrowPredicate`](row_filter::DatafusionArrowPredicate)?
pub pushdown_filters: bool,
+ /// Should the filters be reordered to optimize the scan?
pub reorder_filters: bool,
+ /// Should the page index be read from parquet files, if present, to skip
+ /// data pages
pub enable_page_index: bool,
+ /// Should the bloom filter be read from parquet, if present, to skip row
+ /// groups
pub enable_bloom_filter: bool,
+ /// Schema adapter factory
pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
}
diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs
index 5d2d0ff91b..de508f2c34 100644
--- a/datafusion/core/src/datasource/schema_adapter.rs
+++ b/datafusion/core/src/datasource/schema_adapter.rs
@@ -73,17 +73,18 @@ pub trait SchemaAdapter: Send + Sync {
) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
}
-/// Creates a `SchemaMapping` that can be used to cast or map the columns
-/// from the file schema to the table schema.
+/// Maps, by casting or reordering columns from the file schema to the table
+/// schema.
pub trait SchemaMapper: Debug + Send + Sync {
- /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions.
+ /// Adapts a `RecordBatch` to match the `table_schema` using the stored
+ /// mapping and conversions.
fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
- /// Adapts a [`RecordBatch`] that does not have all the columns from the
+ /// Adapts a [`RecordBatch`] that does not have all the columns from the
/// file schema.
///
- /// This method is used when applying a filter to a subset of the columns during
- /// an `ArrowPredicate`.
+ /// This method is used when applying a filter to a subset of the columns as
+ /// part of `DataFusionArrowPredicate` when `filter_pushdown` is enabled.
///
/// This method is slower than `map_batch` as it looks up columns by name.
fn map_partial_batch(
@@ -92,7 +93,7 @@ pub trait SchemaMapper: Debug + Send + Sync {
) -> datafusion_common::Result<RecordBatch>;
}
-/// Basic implementation of [`SchemaAdapterFactory`] that maps columns by name
+/// Implementation of [`SchemaAdapterFactory`] that maps columns by name
/// and casts columns to the expected type.
#[derive(Clone, Debug, Default)]
pub struct DefaultSchemaAdapterFactory {}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]