This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 46d3f528db Move PruningStatistics into datafusion::common (#16069)
46d3f528db is described below
commit 46d3f528db9f531ab15732d02eeab279ffaeabed
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Wed May 21 05:47:29 2025 -0700
Move PruningStatistics into datafusion::common (#16069)
* Move PruningStatistics into datafusion::common
* fix doc
* remove new code
* fmt
---
datafusion-examples/examples/parquet_index.rs | 3 +-
datafusion-examples/examples/pruning.rs | 3 +-
datafusion/common/src/lib.rs | 1 +
datafusion/common/src/pruning.rs | 124 +++++++++++++++++++++
datafusion/datasource-parquet/src/page_filter.rs | 3 +-
.../datasource-parquet/src/row_group_filter.rs | 3 +-
datafusion/physical-optimizer/src/pruning.rs | 101 +----------------
7 files changed, 134 insertions(+), 104 deletions(-)
diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs
index c19fc2561d..e5ae3cc86b 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/parquet_index.rs
@@ -23,6 +23,7 @@ use arrow::datatypes::{Int32Type, SchemaRef};
use arrow::util::pretty::pretty_format_batches;
use async_trait::async_trait;
use datafusion::catalog::Session;
+use datafusion::common::pruning::PruningStatistics;
use datafusion::common::{
internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
};
@@ -39,7 +40,7 @@ use datafusion::parquet::arrow::{
arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter,
};
use datafusion::physical_expr::PhysicalExpr;
-use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion::physical_optimizer::pruning::PruningPredicate;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::*;
use std::any::Any;
diff --git a/datafusion-examples/examples/pruning.rs b/datafusion-examples/examples/pruning.rs
index 4c802bcdbd..b2d2fa13b7 100644
--- a/datafusion-examples/examples/pruning.rs
+++ b/datafusion-examples/examples/pruning.rs
@@ -20,10 +20,11 @@ use std::sync::Arc;
use arrow::array::{ArrayRef, BooleanArray, Int32Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::common::pruning::PruningStatistics;
use datafusion::common::{DFSchema, ScalarValue};
use datafusion::execution::context::ExecutionProps;
use datafusion::physical_expr::create_physical_expr;
-use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion::physical_optimizer::pruning::PruningPredicate;
use datafusion::prelude::*;
/// This example shows how to use DataFusion's `PruningPredicate` to prove
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 2576ab4312..7b2c86d397 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -47,6 +47,7 @@ pub mod format;
pub mod hash_utils;
pub mod instant;
pub mod parsers;
+pub mod pruning;
pub mod rounding;
pub mod scalar;
pub mod spans;
diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs
new file mode 100644
index 0000000000..014e85eede
--- /dev/null
+++ b/datafusion/common/src/pruning.rs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray};
+use std::collections::HashSet;
+
+use crate::Column;
+use crate::ScalarValue;
+
+/// A source of runtime statistical information to [`PruningPredicate`]s.
+///
+/// # Supported Information
+///
+/// 1. Minimum and maximum values for columns
+///
+/// 2. Null counts and row counts for columns
+///
+/// 3. Whether the values in a column are contained in a set of literals
+///
+/// # Vectorized Interface
+///
+/// Information for containers / files are returned as Arrow [`ArrayRef`], so
+/// the evaluation happens once on a single `RecordBatch`, which amortizes the
+/// overhead of evaluating the predicate. This is important when pruning 1000s
+/// of containers which often happens in analytic systems that have 1000s of
+/// potential files to consider.
+///
+/// For example, for the following three files with a single column `a`:
+/// ```text
+/// file1: column a: min=5, max=10
+/// file2: column a: No stats
+/// file3: column a: min=20, max=30
+/// ```
+///
+/// PruningStatistics would return:
+///
+/// ```text
+/// min_values("a") -> Some([5, Null, 20])
+/// max_values("a") -> Some([10, Null, 30])
+/// min_values("X") -> None
+/// ```
+///
+/// [`PruningPredicate`]: https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/struct.PruningPredicate.html
+pub trait PruningStatistics {
+ /// Return the minimum values for the named column, if known.
+ ///
+ /// If the minimum value for a particular container is not known, the
+ /// returned array should have `null` in that row. If the minimum value is
+ /// not known for any row, return `None`.
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ fn min_values(&self, column: &Column) -> Option<ArrayRef>;
+
+ /// Return the maximum values for the named column, if known.
+ ///
+ /// See [`Self::min_values`] for when to return `None` and null values.
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ fn max_values(&self, column: &Column) -> Option<ArrayRef>;
+
+ /// Return the number of containers (e.g. Row Groups) being pruned with
+ /// these statistics.
+ ///
+ /// This value corresponds to the size of the [`ArrayRef`] returned by
+ /// [`Self::min_values`], [`Self::max_values`], [`Self::null_counts`],
+ /// and [`Self::row_counts`].
+ fn num_containers(&self) -> usize;
+
+ /// Return the number of null values for the named column as an
+ /// [`UInt64Array`]
+ ///
+ /// See [`Self::min_values`] for when to return `None` and null values.
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ ///
+ /// [`UInt64Array`]: arrow::array::UInt64Array
+ fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
+
+ /// Return the number of rows for the named column in each container
+ /// as an [`UInt64Array`].
+ ///
+ /// See [`Self::min_values`] for when to return `None` and null values.
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ ///
+ /// [`UInt64Array`]: arrow::array::UInt64Array
+ fn row_counts(&self, column: &Column) -> Option<ArrayRef>;
+
+ /// Returns [`BooleanArray`] where each row represents information known
+ /// about specific literal `values` in a column.
+ ///
+ /// For example, Parquet Bloom Filters implement this API to communicate
+ /// that `values` are known not to be present in a Row Group.
+ ///
+ /// The returned array has one row for each container, with the following
+ /// meanings:
+ /// * `true` if the values in `column` ONLY contain values from `values`
+ /// * `false` if the values in `column` are NOT ANY of `values`
+ /// * `null` if neither of the above holds or is unknown.
+ ///
+ /// If these statistics can not determine column membership for any
+ /// container, return `None` (the default).
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ fn contained(
+ &self,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
+ ) -> Option<BooleanArray>;
+}
diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs
index d4f486fae0..84f5c4c2d6 100644
--- a/datafusion/datasource-parquet/src/page_filter.rs
+++ b/datafusion/datasource-parquet/src/page_filter.rs
@@ -28,9 +28,10 @@ use arrow::{
array::ArrayRef,
datatypes::{Schema, SchemaRef},
};
+use datafusion_common::pruning::PruningStatistics;
use datafusion_common::ScalarValue;
use datafusion_physical_expr::{split_conjunction, PhysicalExpr};
-use datafusion_physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion_physical_optimizer::pruning::PruningPredicate;
use log::{debug, trace};
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index f6411d6e61..d44fa16843 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -21,9 +21,10 @@ use std::sync::Arc;
use super::{ParquetAccessPlan, ParquetFileMetrics};
use arrow::array::{ArrayRef, BooleanArray};
use arrow::datatypes::Schema;
+use datafusion_common::pruning::PruningStatistics;
use datafusion_common::{Column, Result, ScalarValue};
use datafusion_datasource::FileRange;
-use datafusion_physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion_physical_optimizer::pruning::PruningPredicate;
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::arrow::parquet_column;
use parquet::basic::Type;
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 40d93d4647..1beaa0eb00 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -28,6 +28,7 @@ use arrow::{
datatypes::{DataType, Field, Schema, SchemaRef},
record_batch::{RecordBatch, RecordBatchOptions},
};
+use datafusion_common::pruning::PruningStatistics;
use log::{debug, trace};
use datafusion_common::error::{DataFusionError, Result};
@@ -44,106 +45,6 @@ use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef};
use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr;
use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
-/// A source of runtime statistical information to [`PruningPredicate`]s.
-///
-/// # Supported Information
-///
-/// 1. Minimum and maximum values for columns
-///
-/// 2. Null counts and row counts for columns
-///
-/// 3. Whether the values in a column are contained in a set of literals
-///
-/// # Vectorized Interface
-///
-/// Information for containers / files are returned as Arrow [`ArrayRef`], so
-/// the evaluation happens once on a single `RecordBatch`, which amortizes the
-/// overhead of evaluating the predicate. This is important when pruning 1000s
-/// of containers which often happens in analytic systems that have 1000s of
-/// potential files to consider.
-///
-/// For example, for the following three files with a single column `a`:
-/// ```text
-/// file1: column a: min=5, max=10
-/// file2: column a: No stats
-/// file2: column a: min=20, max=30
-/// ```
-///
-/// PruningStatistics would return:
-///
-/// ```text
-/// min_values("a") -> Some([5, Null, 20])
-/// max_values("a") -> Some([10, Null, 30])
-/// min_values("X") -> None
-/// ```
-pub trait PruningStatistics {
- /// Return the minimum values for the named column, if known.
- ///
- /// If the minimum value for a particular container is not known, the
- /// returned array should have `null` in that row. If the minimum value is
- /// not known for any row, return `None`.
- ///
- /// Note: the returned array must contain [`Self::num_containers`] rows
- fn min_values(&self, column: &Column) -> Option<ArrayRef>;
-
- /// Return the maximum values for the named column, if known.
- ///
- /// See [`Self::min_values`] for when to return `None` and null values.
- ///
- /// Note: the returned array must contain [`Self::num_containers`] rows
- fn max_values(&self, column: &Column) -> Option<ArrayRef>;
-
- /// Return the number of containers (e.g. Row Groups) being pruned with
- /// these statistics.
- ///
- /// This value corresponds to the size of the [`ArrayRef`] returned by
- /// [`Self::min_values`], [`Self::max_values`], [`Self::null_counts`],
- /// and [`Self::row_counts`].
- fn num_containers(&self) -> usize;
-
- /// Return the number of null values for the named column as an
- /// [`UInt64Array`]
- ///
- /// See [`Self::min_values`] for when to return `None` and null values.
- ///
- /// Note: the returned array must contain [`Self::num_containers`] rows
- ///
- /// [`UInt64Array`]: arrow::array::UInt64Array
- fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
-
- /// Return the number of rows for the named column in each container
- /// as an [`UInt64Array`].
- ///
- /// See [`Self::min_values`] for when to return `None` and null values.
- ///
- /// Note: the returned array must contain [`Self::num_containers`] rows
- ///
- /// [`UInt64Array`]: arrow::array::UInt64Array
- fn row_counts(&self, column: &Column) -> Option<ArrayRef>;
-
- /// Returns [`BooleanArray`] where each row represents information known
- /// about specific literal `values` in a column.
- ///
- /// For example, Parquet Bloom Filters implement this API to communicate
- /// that `values` are known not to be present in a Row Group.
- ///
- /// The returned array has one row for each container, with the following
- /// meanings:
- /// * `true` if the values in `column` ONLY contain values from `values`
- /// * `false` if the values in `column` are NOT ANY of `values`
- /// * `null` if the neither of the above holds or is unknown.
- ///
- /// If these statistics can not determine column membership for any
- /// container, return `None` (the default).
- ///
- /// Note: the returned array must contain [`Self::num_containers`] rows
- fn contained(
- &self,
- column: &Column,
- values: &HashSet<ScalarValue>,
- ) -> Option<BooleanArray>;
-}
-
/// Used to prove that arbitrary predicates (boolean expression) can not
/// possibly evaluate to `true` given information about a column provided by
/// [`PruningStatistics`].
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]