This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 2956dbf30f fix: Do not assume missing nullcount stat means zero 
nullcount (#9481)
2956dbf30f is described below

commit 2956dbf30fe5b50f8f76e6bad93505a8e7b86eb5
Author: Ryan Johnson <[email protected]>
AuthorDate: Wed Mar 11 12:46:51 2026 -0600

    fix: Do not assume missing nullcount stat means zero nullcount (#9481)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/9451
    - Closes https://github.com/apache/arrow-rs/issues/6256
    
    # Rationale for this change
    
    A reader might be annoyed (performance wart) if a parquet footer lacks
    nullcount stats, but inferring nullcount=0 for missing stats makes the
    stats untrustworthy and can lead to incorrect behavior.
    
    # What changes are included in this PR?
    
    If a parquet footer nullcount stat is missing, surface it as None,
    reserving `Some(0)` for known-no-null cases.
    
    # Are these changes tested?
    
    Fixed one unit test that broke, added a missing unit test that covers
    the other change site.
    
    # Are there any user-facing changes?
    
    The stats API doesn't change signature, but there is a behavior change.
    The existing doc that called out the incorrect behavior has been removed
    to reflect that the incorrect behavior no longer occurs.
---
 parquet/src/file/metadata/thrift/mod.rs | 73 ++++++++++++++++++++++++++-------
 parquet/src/file/statistics.rs          | 56 ++++++++++++-------------
 2 files changed, 85 insertions(+), 44 deletions(-)

diff --git a/parquet/src/file/metadata/thrift/mod.rs 
b/parquet/src/file/metadata/thrift/mod.rs
index ddb5aa16b0..88cb96f355 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -192,20 +192,19 @@ fn convert_stats(
     use crate::file::statistics::Statistics as FStatistics;
     Ok(match thrift_stats {
         Some(stats) => {
-            // Number of nulls recorded, when it is not available, we just 
mark it as 0.
-            // TODO this should be `None` if there is no information about 
NULLS.
-            // see https://github.com/apache/arrow-rs/pull/6216/files
-            let null_count = stats.null_count.unwrap_or(0);
-
-            if null_count < 0 {
-                return Err(general_err!(
-                    "Statistics null count is negative {}",
-                    null_count
-                ));
-            }
-
             // Generic null count.
-            let null_count = Some(null_count as u64);
+            let null_count = stats
+                .null_count
+                .map(|null_count| {
+                    if null_count < 0 {
+                        return Err(general_err!(
+                            "Statistics null count is negative {}",
+                            null_count
+                        ));
+                    }
+                    Ok(null_count as u64)
+                })
+                .transpose()?;
             // Generic distinct count (count of distinct values occurring)
             let distinct_count = stats.distinct_count.map(|value| value as 
u64);
             // Whether or not statistics use deprecated min/max fields.
@@ -1722,6 +1721,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
 
 #[cfg(test)]
 pub(crate) mod tests {
+    use crate::basic::Type as PhysicalType;
     use crate::errors::Result;
     use crate::file::metadata::thrift::{BoundingBox, SchemaElement, 
write_schema};
     use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, 
RowGroupMetaData};
@@ -1730,7 +1730,8 @@ pub(crate) mod tests {
         ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, 
read_thrift_vec,
     };
     use crate::schema::types::{
-        ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes, 
parquet_schema_from_array,
+        ColumnDescriptor, ColumnPath, SchemaDescriptor, TypePtr, num_nodes,
+        parquet_schema_from_array,
     };
     use std::sync::Arc;
 
@@ -1828,4 +1829,48 @@ pub(crate) mod tests {
             mmax: Some(42.0.into()),
         });
     }
+
+    #[test]
+    fn test_convert_stats_preserves_missing_null_count() {
+        let primitive =
+            crate::schema::types::Type::primitive_type_builder("col", 
PhysicalType::INT32)
+                .build()
+                .unwrap();
+        let column_descr = Arc::new(ColumnDescriptor::new(
+            Arc::new(primitive),
+            0,
+            0,
+            ColumnPath::new(vec![]),
+        ));
+
+        let none_null_count = super::Statistics {
+            max: None,
+            min: None,
+            null_count: None,
+            distinct_count: None,
+            max_value: None,
+            min_value: None,
+            is_max_value_exact: None,
+            is_min_value_exact: None,
+        };
+        let decoded_none = super::convert_stats(&column_descr, 
Some(none_null_count))
+            .unwrap()
+            .unwrap();
+        assert_eq!(decoded_none.null_count_opt(), None);
+
+        let zero_null_count = super::Statistics {
+            max: None,
+            min: None,
+            null_count: Some(0),
+            distinct_count: None,
+            max_value: None,
+            min_value: None,
+            is_max_value_exact: None,
+            is_min_value_exact: None,
+        };
+        let decoded_zero = super::convert_stats(&column_descr, 
Some(zero_null_count))
+            .unwrap()
+            .unwrap();
+        assert_eq!(decoded_zero.null_count_opt(), Some(0));
+    }
 }
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index a813e82d13..9682fd54b8 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -125,19 +125,18 @@ pub(crate) fn from_thrift_page_stats(
 ) -> Result<Option<Statistics>> {
     Ok(match thrift_stats {
         Some(stats) => {
-            // Number of nulls recorded, when it is not available, we just 
mark it as 0.
-            // TODO this should be `None` if there is no information about 
NULLS.
-            // see https://github.com/apache/arrow-rs/pull/6216/files
-            let null_count = stats.null_count.unwrap_or(0);
-
-            if null_count < 0 {
-                return Err(ParquetError::General(format!(
-                    "Statistics null count is negative {null_count}",
-                )));
-            }
-
             // Generic null count.
-            let null_count = Some(null_count as u64);
+            let null_count = stats
+                .null_count
+                .map(|null_count| {
+                    if null_count < 0 {
+                        return Err(ParquetError::General(format!(
+                            "Statistics null count is negative {null_count}",
+                        )));
+                    }
+                    Ok(null_count as u64)
+                })
+                .transpose()?;
             // Generic distinct count (count of distinct values occurring)
             let distinct_count = stats.distinct_count.map(|value| value as 
u64);
             // Whether or not statistics use deprecated min/max fields.
@@ -431,9 +430,20 @@ impl Statistics {
     /// Returns number of null values for the column, if known.
     /// Note that this includes all nulls when column is part of the complex 
type.
     ///
-    /// Note this API returns Some(0) even if the null count was not present
-    /// in the statistics.
-    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
+    /// Note: Versions of this library prior to `58.1.0` returned `0` if the 
null count
+    /// was not available. This method now returns `None` in that case.
+    ///
+    /// Also, versions of this library prior to `53.1.0` did not store a null 
count
+    /// statistic when the null count was `0`.
+    ///
+    /// It is unsound to assume that missing nullcount stats mean the column 
contains no nulls,
+    /// but code that depends on the old behavior can restore it by defaulting 
to zero:
+    ///
+    /// ```no_run
+    /// # use parquet::file::statistics::Statistics;
+    /// # let statistics: Statistics = todo!();
+    /// let null_count = statistics.null_count_opt().unwrap_or(0);
+    /// ```
     pub fn null_count_opt(&self) -> Option<u64> {
         statistics_enum_func![self, null_count_opt]
     }
@@ -1064,21 +1074,7 @@ mod tests {
         let round_tripped = from_thrift_page_stats(Type::BOOLEAN, 
Some(thrift_stats))
             .unwrap()
             .unwrap();
-        // TODO: remove branch when we no longer support assuming 
null_count==None in the thrift
-        // means null_count = Some(0)
-        if null_count.is_none() {
-            assert_ne!(round_tripped, statistics);
-            assert!(round_tripped.null_count_opt().is_some());
-            assert_eq!(round_tripped.null_count_opt(), Some(0));
-            assert_eq!(round_tripped.min_bytes_opt(), 
statistics.min_bytes_opt());
-            assert_eq!(round_tripped.max_bytes_opt(), 
statistics.max_bytes_opt());
-            assert_eq!(
-                round_tripped.distinct_count_opt(),
-                statistics.distinct_count_opt()
-            );
-        } else {
-            assert_eq!(round_tripped, statistics);
-        }
+        assert_eq!(round_tripped, statistics);
     }
 
     fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) 
-> Statistics {

Reply via email to