alamb commented on code in PR #21637:
URL: https://github.com/apache/datafusion/pull/21637#discussion_r3233928827
##########
datafusion/physical-expr-common/src/metrics/value.rs:
##########
@@ -1010,7 +1010,10 @@ impl MetricValue {
Self::SpilledBytes(_) => 11,
Self::SpilledRows(_) => 12,
Self::CurrentMemoryUsage(_) => 13,
- Self::Count { .. } => 14,
+ Self::Count { name, .. } => match name.as_ref() {
+ "page_index_pages_skipped_by_fully_matched" => 8,
Review Comment:
this may be worth a comment to explain why it is special casing
page_index_pages_skipped_by_fully_matched
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -1075,41 +1076,54 @@ impl RowGroupsPrunedParquetOpen {
let file_metadata = Arc::clone(reader_metadata.metadata());
let rg_metadata = file_metadata.row_groups();
- // Filter pushdown: evaluate predicates during scan
- let row_filter = if let Some(predicate) = prepared
+ // Filter pushdown: evaluate predicates during scan.
+ // Keep the predicate around so we can rebuild RowFilter per decoder
run
+ // when fully matched row groups split the scan into multiple decoders.
+ let pushdown_predicate = prepared
.pushdown_filters
.then_some(prepared.predicate.clone())
- .flatten()
- {
- let row_filter = row_filter::build_row_filter(
- &predicate,
- &prepared.physical_file_schema,
- file_metadata.as_ref(),
- prepared.reorder_predicates,
- &prepared.file_metrics,
- );
+ .flatten();
- match row_filter {
- Ok(Some(filter)) => Some(filter),
- Ok(None) => None,
- Err(e) => {
- debug!("Ignoring error building row filter for
'{predicate:?}': {e}");
- None
+ let try_build_row_filter =
+ |predicate: &Arc<dyn PhysicalExpr>| -> Option<RowFilter> {
+ match row_filter::build_row_filter(
+ predicate,
+ &prepared.physical_file_schema,
+ file_metadata.as_ref(),
+ prepared.reorder_predicates,
+ &prepared.file_metrics,
+ ) {
+ Ok(Some(filter)) => Some(filter),
+ Ok(None) => None,
+ Err(e) => {
+ debug!(
+ "Ignoring error building row filter for
'{predicate:?}': {e}"
+ );
+ None
+ }
}
- }
- } else {
- None
- };
+ };
+
+ // Build the first RowFilter eagerly; it will be reused for the first
Review Comment:
SOunds good.
I think @adriangb was also talking recently about restructuing the Parquet
opener so it could decide more dynamically decide how to evaluate predicates
(in this case for example it decides not to evaluate a predicate at all). He
was also thinking we could dynamically choose between pushdown predicate into
the scan or not
no action required for this PR, I am just commenting here that we seem to be
treding in this direction
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]