Michael-J-Ward commented on code in PR #6216:
URL: https://github.com/apache/arrow-rs/pull/6216#discussion_r1712044280


##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -131,24 +131,64 @@ macro_rules! make_stats_iterator {
 
 make_stats_iterator!(
     MinBooleanStatsIterator,
-    min,
+    min_unchecked,
     ParquetStatistics::Boolean,
     bool
 );
 make_stats_iterator!(
     MaxBooleanStatsIterator,
-    max,
+    max_unchecked,
     ParquetStatistics::Boolean,
     bool
 );
-make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, 
i32);
-make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, 
i32);
-make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, 
i64);
-make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, 
i64);
-make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, 
f32);
-make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, 
f32);
-make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, 
f64);
-make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, 
f64);
+make_stats_iterator!(
+    MinInt32StatsIterator,
+    min_unchecked,
+    ParquetStatistics::Int32,
+    i32
+);
+make_stats_iterator!(
+    MaxInt32StatsIterator,
+    max_unchecked,
+    ParquetStatistics::Int32,
+    i32
+);
+make_stats_iterator!(
+    MinInt64StatsIterator,
+    min_unchecked,
+    ParquetStatistics::Int64,
+    i64
+);
+make_stats_iterator!(
+    MaxInt64StatsIterator,
+    max_unchecked,
+    ParquetStatistics::Int64,
+    i64
+);
+make_stats_iterator!(
+    MinFloatStatsIterator,
+    min_unchecked,
+    ParquetStatistics::Float,
+    f32
+);
+make_stats_iterator!(
+    MaxFloatStatsIterator,
+    max_unchecked,
+    ParquetStatistics::Float,
+    f32
+);
+make_stats_iterator!(
+    MinDoubleStatsIterator,
+    min_unchecked,
+    ParquetStatistics::Double,
+    f64
+);
+make_stats_iterator!(
+    MaxDoubleStatsIterator,
+    max_unchecked,
+    ParquetStatistics::Double,
+    f64
+);
 make_stats_iterator!(
     MinByteArrayStatsIterator,
     min_bytes,

Review Comment:
   If we change `Statistics::{min,max}_bytes` to return `Option<&[u8]>`, The 
use of `*_unchecked` methods can be switched to the new imps and just use 
`unwrap`



##########
parquet/src/file/statistics.rs:
##########
@@ -601,6 +601,7 @@ impl<T: ParquetValueType> ValueStatistics<T> {
     /// using signed comparison. This resulted in an undefined ordering for 
unsigned
     /// quantities, such as booleans and unsigned integers.
     ///
+    // TODO: I don't see `min_value` or `max_value` in this struct. Should 
these be something else?

Review Comment:
   And follow-up in case there should be other fields, should `min_bytes` 
referr to the deprecated `min` or to the new `min_value`



##########
parquet/src/file/statistics.rs:
##########
@@ -538,37 +537,47 @@ impl<T: ParquetValueType> ValueStatistics<T> {
     ///
     /// Panics if min value is not set, e.g. all values are `null`.
     /// Use `has_min_max_set` method to check that.
-    pub fn min(&self) -> &T {
+    pub(crate) fn min_unchecked(&self) -> &T {
         self.min.as_ref().unwrap()
     }
 
+    /// Returns min value of the statistics.
+    pub fn min(&self) -> Option<&T> {
+        self.min.as_ref()
+    }
+
     /// Returns max value of the statistics.
     ///
     /// Panics if max value is not set, e.g. all values are `null`.
     /// Use `has_min_max_set` method to check that.
-    pub fn max(&self) -> &T {
+    pub(crate) fn max_unchecked(&self) -> &T {
         self.max.as_ref().unwrap()
     }
 
+    /// Returns max value of the statistics.
+    pub fn max(&self) -> Option<&T> {
+        self.max.as_ref()
+    }
+
     /// Returns min value as bytes of the statistics.
     ///
     /// Panics if min value is not set, use `has_min_max_set` method to check
     /// if values are set.
     pub fn min_bytes(&self) -> &[u8] {

Review Comment:
   Should `min_bytes` and `max_bytes` be updated to return `Option<&[u8]>`



##########
parquet/src/file/statistics.rs:
##########
@@ -385,18 +380,13 @@ impl Statistics {
 
     /// Returns number of null values for the column.
     /// Note that this includes all nulls when column is part of the complex 
type.
-    pub fn null_count(&self) -> u64 {
+    pub fn null_count(&self) -> Option<u64> {
         statistics_enum_func![self, null_count]
     }
 
-    /// Returns `true` if statistics collected any null values, `false` 
otherwise.
-    pub fn has_nulls(&self) -> bool {
-        self.null_count() > 0
-    }
-
     /// Returns `true` if min value and max value are set.
     /// Normally both min/max values will be set to `Some(value)` or `None`.
-    pub fn has_min_max_set(&self) -> bool {
+    pub(crate) fn has_min_max_set(&self) -> bool {

Review Comment:
   This method is used both in tests for assertions and for logical flow in 
source code, so I kept it as internal to this crate.



##########
parquet/src/column/writer/mod.rs:
##########
@@ -769,8 +769,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                 }
                 Some(stat) => {
                     // Check if min/max are still ascending/descending across 
pages
-                    let new_min = stat.min();
-                    let new_max = stat.max();
+                    let new_min = stat.min_unchecked();

Review Comment:
   Previously, this method did not check `has_minx_max_set` before calling 
`stat.min()`.
   
   Should it have or is it guaranteed to already be set by this point?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to