viirya commented on code in PR #8394:
URL: https://github.com/apache/arrow-datafusion/pull/8394#discussion_r1412857747


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -66,43 +66,57 @@ use log::trace;
 /// min_values("X") -> None
 /// ```
 pub trait PruningStatistics {
-    /// return the minimum values for the named column, if known.
-    /// Note: the returned array must contain `num_containers()` rows
+    /// Return the minimum values for the named column, if known.
+    ///
+    /// If the minimum value for a particular container is not known, the
+    /// returned array should have `null` in that row. If the minimum value is
+    /// not know for any row, return `None`.
+    ///
+    /// Note: the returned array must contain [`Self::num_containers`] rows
     fn min_values(&self, column: &Column) -> Option<ArrayRef>;
 
-    /// return the maximum values for the named column, if known.
-    /// Note: the returned array must contain `num_containers()` rows.
+    /// Return the maximum values for the named column, if known.
+    ///
+    /// See [`Self::min_values`] for when to return `None` and null values.
+    ///
+    /// Note: the returned array must contain [`Self::num_containers`] rows
     fn max_values(&self, column: &Column) -> Option<ArrayRef>;
 
-    /// return the number of containers (e.g. row groups) being
-    /// pruned with these statistics
+    /// Return the number of containers (e.g. row groups) being
+    /// pruned with these statistics (the number of rows in each returned 
array)
     fn num_containers(&self) -> usize;
 
-    /// return the number of null values for the named column as an
+    /// Return the number of null values for the named column as an
     /// `Option<UInt64Array>`.
     ///
-    /// Note: the returned array must contain `num_containers()` rows.
+    /// See [`Self::min_values`] for when to return `None` and null values.
+    ///
+    /// Note: the returned array must contain [`Self::num_containers`] rows
     fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
 }
 
-/// Evaluates filter expressions on statistics, rather than the actual data. If
-/// no rows could possibly pass the filter entire containers can be "pruned"
-/// (skipped), without reading any actual data, leading to significant
+/// Evaluates filter expressions on statistics such as min/max values and null
+/// counts, attempting to prove a "container" (e.g. Parquet Row Group) can be
+/// skipped without reading the actual data, potentially leading to significant
 /// performance improvements.
 ///
-/// [`PruningPredicate`]s are used to prune (avoid scanning) Parquet Row Groups
+/// For example, [`PruningPredicate`]s are used to prune Parquet Row Groups
 /// based on the min/max values found in the Parquet metadata. If the
 /// `PruningPredicate` can guarantee that no rows in the Row Group match the
 /// filter, the entire Row Group is skipped during query execution.
 ///
-/// Note that this API is designed to be general, as it works:
+/// The `PruningPredicate` API is general, allowing it to be used for pruning
+/// other types of containers (e.g. files) based on  statistics that may be

Review Comment:
   ```suggestion
   /// other types of containers (e.g. files) based on statistics that may be
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to