alamb commented on code in PR #20004:
URL: https://github.com/apache/datafusion/pull/20004#discussion_r2727350462
##########
datafusion/datasource-parquet/src/metadata.rs:
##########
@@ -487,22 +487,38 @@ fn summarize_min_max_null_counts(
if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
max_acc.update_batch(&[Arc::clone(&max_values)])?;
- let mut cur_max_acc = max_acc.clone();
- accumulators.is_max_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_max_acc.evaluate()?,
- &max_values,
- &is_max_value_exact_stat,
- );
+
+ let exactness = &is_max_value_exact_stat;
Review Comment:
I think it might help future readers to specify what this is doing
```suggestion
// handle the common special case when all row groups have exact
statistics
let exactness = &is_max_value_exact_stat;
```
Another way that might make it easier to understand would be to pull this
into a separate function
##########
datafusion/datasource-parquet/src/metadata.rs:
##########
@@ -487,22 +487,38 @@ fn summarize_min_max_null_counts(
if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
max_acc.update_batch(&[Arc::clone(&max_values)])?;
- let mut cur_max_acc = max_acc.clone();
- accumulators.is_max_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_max_acc.evaluate()?,
- &max_values,
- &is_max_value_exact_stat,
- );
+
+ let exactness = &is_max_value_exact_stat;
+ if !exactness.is_empty()
+ && exactness.null_count() == 0
+ && exactness.true_count() == exactness.len()
+ {
+ accumulators.is_max_value_exact[logical_schema_index] = Some(true);
Review Comment:
It does seem like this fast path will be hit often (as it is when all stats
are exact, a common case).
##########
datafusion/datasource-parquet/src/metadata.rs:
##########
@@ -582,6 +598,15 @@ fn has_any_exact_match(
array: &ArrayRef,
exactness: &BooleanArray,
) -> Option<bool> {
+ if value.is_null() {
+ return Some(false);
+ }
+
+ // Shortcut for single row group
+ if array.len() == 1 {
+ return Some(exactness.is_valid(0) && exactness.value(0));
+ }
+
let scalar_array = value.to_scalar().ok()?;
let eq_mask = eq(&scalar_array, &array).ok()?;
let combined_mask = and(&eq_mask, exactness).ok()?;
Review Comment:
As an aside, there is the type if pattern where I would really like an API
that can reuse the allocation
- https://github.com/apache/arrow-rs/issues/8809
This could easily reuse the allocation of eq_mask
(not tis PR)
##########
datafusion/datasource-parquet/src/metadata.rs:
##########
@@ -487,22 +487,38 @@ fn summarize_min_max_null_counts(
if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
max_acc.update_batch(&[Arc::clone(&max_values)])?;
- let mut cur_max_acc = max_acc.clone();
- accumulators.is_max_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_max_acc.evaluate()?,
- &max_values,
- &is_max_value_exact_stat,
- );
+
+ let exactness = &is_max_value_exact_stat;
+ if !exactness.is_empty()
+ && exactness.null_count() == 0
+ && exactness.true_count() == exactness.len()
+ {
+ accumulators.is_max_value_exact[logical_schema_index] = Some(true);
+ } else if exactness.true_count() == 0 {
+ accumulators.is_max_value_exact[logical_schema_index] =
Some(false);
+ } else {
+ let val = max_acc.evaluate()?;
+ accumulators.is_max_value_exact[logical_schema_index] =
+ has_any_exact_match(&val, &max_values, exactness);
+ }
}
if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
min_acc.update_batch(&[Arc::clone(&min_values)])?;
- let mut cur_min_acc = min_acc.clone();
- accumulators.is_min_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_min_acc.evaluate()?,
- &min_values,
- &is_min_value_exact_stat,
- );
+
+ let exactness = &is_min_value_exact_stat;
Review Comment:
```suggestion
// handle the common special case when all row groups have exact
statistics
let exactness = &is_min_value_exact_stat;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]