alamb commented on code in PR #10647: URL: https://github.com/apache/datafusion/pull/10647#discussion_r1614666750
########## datafusion/core/src/datasource/physical_plan/parquet/mod.rs: ########## @@ -642,11 +714,22 @@ fn should_enable_page_index( .unwrap_or(false) } -/// Factory of parquet file readers. +/// Interface for creating [`AsyncFileReader`]s to read parquet files. +/// +/// This interface is used by [`ParquetOpener`] in order to create readers for +/// parquet files. Implementations of this trait can be used to provide custom Review Comment: Excellent idea. I did so ########## datafusion/core/src/datasource/physical_plan/parquet/mod.rs: ########## @@ -75,7 +75,79 @@ pub use metrics::ParquetFileMetrics; pub use schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper}; pub use statistics::{RequestedStatistics, StatisticsConverter}; -/// Execution plan for scanning one or more Parquet partitions +/// Execution plan for reading one or more Parquet files. +/// +/// ```text +/// ▲ +/// │ +/// │ Produce a stream of +/// │ RecordBatches +/// │ +/// ┌───────────────────────┐ +/// │ │ +/// │ ParquetExec │ +/// │ │ +/// └───────────────────────┘ +/// ▲ +/// │ Asynchronously read from one +/// │ or more parquet files via +/// │ ObjectStore interface +/// │ +/// │ +/// .───────────────────. +/// │ ) +/// │`───────────────────'│ +/// │ ObjectStore │ +/// │.───────────────────.│ +/// │ ) +/// `───────────────────' +/// +/// ``` +/// # Features +/// +/// Supports the following optimizations: +/// +/// * Multi-threaded (aka multi-partition): read from one or more files in +/// parallel. Can read concurrently from multiple row groups from a single file. +/// +/// * Predicate push down: skips row groups and pages based on +/// min/max/null_counts in the row group metadata, the page index and bloom +/// filters. +/// +/// * Projection pushdown: reads and decodes only the columns required. +/// +/// * Limit pushdown: stop execution early after some number of rows are read. +/// +/// * Custom readers: controls I/O for accessing pages. 
See Review Comment: good call -- updated -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@datafusion.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscribe@datafusion.apache.org For additional commands, e-mail: github-help@datafusion.apache.org