alamb commented on code in PR #9619:
URL: https://github.com/apache/arrow-rs/pull/9619#discussion_r3313869989


##########
parquet/src/file/writer.rs:
##########
@@ -1228,7 +1228,7 @@ mod tests {
             // INTERVAL
             ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED),
             // Float16
-            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED),

Review Comment:
   Should we also add tests that cover the other floating point types?



##########
parquet/src/file/statistics.rs:
##########
@@ -139,6 +139,8 @@ pub(crate) fn from_thrift_page_stats(
                 .transpose()?;
             // Generic distinct count (count of distinct values occurring)
             let distinct_count = stats.distinct_count.map(|value| value as 
u64);
+            // Generic nan count for floating point types
+            let nan_count = stats.nan_count.map(|value| value as u64);

Review Comment:
   should we be checking for value fitting in i64 (why do we need to cast to 
u64 🤔 )



##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -2963,10 +2964,120 @@ mod tests {
             for column in row_group.columns() {
                 assert!(column.offset_index_offset().is_some());
                 assert!(column.offset_index_length().is_some());
-                assert!(column.column_index_offset().is_none());
-                assert!(column.column_index_length().is_none());
+                assert!(column.column_index_offset().is_some());
+                assert!(column.column_index_length().is_some());
             }
         }
+        assert!(file_meta_data.column_index().is_some());
+        if let Some(col_indexes) = file_meta_data.column_index() {
+            for rg_idx in col_indexes {
+                for idx in rg_idx {
+                    assert!(idx.nan_counts().is_some());
+                    let float_idx = match idx {
+                        ColumnIndexMetaData::DOUBLE(idx) => idx,
+                        _ => panic!("expected double statistics"),
+                    };
+                    for i in 0..idx.num_pages() as usize {
+                        assert_eq!(float_idx.nan_count(i), Some(10));
+                        assert_eq!(
+                            
f64::NAN.total_cmp(float_idx.min_value(i).unwrap()),

Review Comment:
   that is an interesting way to compare to Nan but it makes sense



##########
parquet/src/file/page_index/column_index.rs:
##########
@@ -635,6 +682,13 @@ impl ColumnIndexMetaData {
         colidx_enum_func!(self, null_count, idx)
     }
 
+    /// Returns the number of NaN values in the page indexed by `idx`
+    ///
+    /// Returns `None` if no null counts have been set in the index

Review Comment:
   ```suggestion
       /// Returns `None` if no Nan counts have been set in the index
   ```



##########
parquet/src/file/page_index/column_index.rs:
##########
@@ -623,6 +653,23 @@ impl ColumnIndexMetaData {
         }
     }
 
+    /// Returns array of NaN counts, one per page.
+    ///
+    /// Returns `None` if now null counts have been set in the index

Review Comment:
   Is this supposed to mean "no" null counts?  Or maybe it should be "Nan 
counts"?
   
   ```suggestion
       /// Returns `None` if no NaN counts have been set in the index
   ```



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1477,80 +1513,106 @@ impl<'a, E: ColumnValueEncoder> 
GenericColumnWriter<'a, E> {
 }
 
 fn update_min<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T, min: 
&mut Option<T>) {
-    update_stat::<T, _>(descr, val, min, |cur| compare_greater(descr, cur, 
val))
+    if min.is_none() {
+        *min = Some(val.clone());
+    } else {
+        // safe to unwrap min since we've already tested for None
+        let basic_type_info = descr.get_basic_info();

Review Comment:
   Could you get rid of the unwrap if you did something like
   ```rust
   let min = min.as_mut().unwrap_or_else(|| val.clone())
   ```



##########
parquet/src/file/statistics.rs:
##########
@@ -263,6 +275,12 @@ pub(crate) fn from_thrift_page_stats(
                         null_count,
                         old_format,
                     )
+                    // Note: We set nan_count here even though we can't verify 
if this is Float16.
+                    // The spec says nan_count should only be set for Float16 
logical type,
+                    // but this function doesn't have access to logical type 
information.
+                    // Writers should only set nan_count for Float16, and 
readers should

Review Comment:
   👍 



##########
parquet/src/column/writer/encoder.rs:
##########
@@ -326,63 +332,52 @@ impl<T: DataType> ColumnValueEncoder for 
ColumnValueEncoderImpl<T> {
     }
 }
 
-fn get_min_max<'a, T, I>(descr: &ColumnDescriptor, mut iter: I) -> Option<(T, 
T)>
+// Get min and max values for all values in `iter`.
+//
+// For floating point we need to compare NaN values until we encounter a 
non-NaN
+// value which then becomes the new min/max. After this, only non-NaN values 
are
+// evaluated. If all values are NaN, then the min/max NaNs as determined by
+// IEEE 754 total order are returned.
+fn get_min_max<'a, T, I>(basic_type_info: &BasicTypeInfo, mut iter: I) -> 
Option<(T, T, u64)>
 where
     T: ParquetValueType + 'a,
     I: Iterator<Item = &'a T>,
 {
-    let first = loop {
-        let next = iter.next()?;
-        if !is_nan(descr, next) {
-            break next;
-        }
-    };
+    let first = iter.next()?;
+    let mut min_max_nan = is_nan(basic_type_info, first);

Review Comment:
   What do you think about maybe paramterizing this function on "can_be_nan" or 
something and then only pass `can_be_nan = true` for the floating point types? 
Then maybe the compiler could generate better code (maybe even faster) for non 
float types



##########
parquet/src/column/writer/mod.rs:
##########
@@ -906,17 +910,31 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, 
E> {
         page_statistics: Option<&ValueStatistics<E::T>>,
         page_variable_length_bytes: Option<i64>,
     ) {
+        // Determine if this is a floating-point column
+        let is_float_column = matches!(self.descr.physical_type(), Type::FLOAT 
| Type::DOUBLE)

Review Comment:
   maybe this would be good as a standalone function (on `Type` perhaps 🤔 )



##########
parquet/src/column/writer/mod.rs:
##########
@@ -906,17 +910,31 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, 
E> {
         page_statistics: Option<&ValueStatistics<E::T>>,
         page_variable_length_bytes: Option<i64>,
     ) {
+        // Determine if this is a floating-point column
+        let is_float_column = matches!(self.descr.physical_type(), Type::FLOAT 
| Type::DOUBLE)

Review Comment:
   It also feels like since this is all templated anyways, I wonder if there is 
some way to pass the "IS_FLOAT" flag as a type parameter so the specialized 
implemntations don't have to check at runtime
   
   This could be a follow on PR of course (or maybe a just a bad idea)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to