alamb commented on PR #7401: URL: https://github.com/apache/arrow-rs/pull/7401#issuecomment-2802658761
I also verified the filter patterns like this:

<details><summary>Patch</summary>
<p>

```diff
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 8bbe175daf..11ceaed569 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -976,7 +976,9 @@ pub(crate) fn evaluate_predicate(
     input_selection: Option<RowSelection>,
     predicate: &mut dyn ArrowPredicate,
 ) -> Result<RowSelection> {
+    println!("Evaluating predicate, batch_size: {batch_size}, input_selection: {:?}", input_selection);
     let reader = ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone());
+    let mut total_input_rows = 0;
     let mut filters = vec![];
     for maybe_batch in reader {
         let maybe_batch = maybe_batch?;
@@ -993,9 +995,15 @@ pub(crate) fn evaluate_predicate(
             0 => filters.push(filter),
             _ => filters.push(prep_null_mask_filter(&filter)),
         };
+        total_input_rows += input_rows;
     }
 
     let raw = RowSelection::from_filters(&filters);
+    let selected_rows = raw.row_count();
+    let num_selections = raw.iter().count();
+    let selectivity = 100.0* (selected_rows as f64 / total_input_rows as f64);
+    println!(" Selected {selected_rows} rows in {num_selections} selections ({selectivity:.3}%)", );
+    println!(" RowSelection: {}", raw);
     Ok(match input_selection {
         Some(selection) => selection.and_then(&raw),
         None => raw,
diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
index c53d47be2e..475b06315d 100644
--- a/parquet/src/arrow/arrow_reader/selection.rs
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -19,6 +19,7 @@ use arrow_array::{Array, BooleanArray};
 use arrow_select::filter::SlicesIterator;
 use std::cmp::Ordering;
 use std::collections::VecDeque;
+use std::fmt::{Display, Formatter};
 use std::ops::Range;
 
 /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when
@@ -32,6 +33,16 @@ pub struct RowSelector {
     pub skip: bool,
 }
 
+impl Display for RowSelector {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        if self.skip {
+            write!(f, "skip({})", self.row_count)
+        } else {
+            write!(f, "select({})", self.row_count)
+        }
+    }
+}
+
 impl RowSelector {
     /// Select `row_count` rows
     pub fn select(row_count: usize) -> Self {
@@ -101,6 +112,22 @@ pub struct RowSelection {
     selectors: Vec<RowSelector>,
 }
 
+/// Prints a human understandable representation of the RowSelection
+impl Display for RowSelection {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        let mut selectors = self.selectors.iter();
+
+        if let Some(first) = selectors.next() {
+            write!(f, "{}", first)?;
+            for selector in selectors {
+                write!(f, " {}", selector)?;
+            }
+        }
+        write!(f, "]")
+    }
+}
+
 impl RowSelection {
     /// Creates a [`RowSelection`] from a slice of [`BooleanArray`]
     ///
```

</p>
</details>

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org