alamb commented on code in PR #6190:
URL: https://github.com/apache/arrow-rs/pull/6190#discussion_r1705494723


##########
parquet/tests/arrow_reader/statistics.rs:
##########
@@ -2195,3 +2195,429 @@ async fn test_column_non_existent() {
     }
     .run_with_schema(&schema);
 }
+
+/// The following tests were initially in `arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs`.
+/// Some of them were moved here to avoid confusion and duplication,
+/// including edge conditions and data-file validations for `row_group` statistics only.
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrow::util::test_util::parquet_test_data;
+    use arrow_array::{
+        new_empty_array, ArrayRef, BooleanArray, Decimal128Array, 
Float32Array, Float64Array,
+        Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, 
StringArray,
+        TimestampNanosecondArray,
+    };
+    use arrow_schema::{DataType, SchemaRef, TimeUnit};
+    use bytes::Bytes;
+    use parquet::arrow::parquet_column;
+    use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+    use std::path::PathBuf;
+    use std::sync::Arc;
+    // TODO error cases (with parquet statistics that are mismatched in 
expected type)
+
+    #[test]
+    fn roundtrip_empty() {
+        let all_types = vec![
+            DataType::Null,
+            DataType::Boolean,
+            DataType::Int8,
+            DataType::Int16,
+            DataType::Int32,
+            DataType::Int64,
+            DataType::UInt8,
+            DataType::UInt16,
+            DataType::UInt32,
+            DataType::UInt64,
+            DataType::Float16,
+            DataType::Float32,
+            DataType::Float64,
+            DataType::Timestamp(TimeUnit::Second, None),
+            DataType::Date32,
+            DataType::Date64,
+            // DataType::Time32(Second),
+            // DataType::Time64(Second),
+            // DataType::Duration(Second),
+            // DataType::Interval(IntervalUnit),
+            DataType::Binary,
+            DataType::FixedSizeBinary(0),
+            DataType::LargeBinary,
+            DataType::BinaryView,
+            DataType::Utf8,
+            DataType::LargeUtf8,
+            DataType::Utf8View,
+            // DataType::List(FieldRef),
+            // DataType::ListView(FieldRef),
+            // DataType::FixedSizeList(FieldRef, i32),
+            // DataType::LargeList(FieldRef),
+            // DataType::LargeListView(FieldRef),
+            // DataType::Struct(Fields),
+            // DataType::Union(UnionFields, UnionMode),
+            // DataType::Dictionary(Box<DataType>, Box<DataType>),
+            // DataType::Decimal128(u8, i8),
+            // DataType::Decimal256(u8, i8),
+            // DataType::Map(FieldRef, bool),
+            // DataType::RunEndEncoded(FieldRef, FieldRef),
+        ];
+        for data_type in all_types {
+            let empty_array = new_empty_array(&data_type);
+            Test {
+                input: empty_array.clone(),
+                expected_min: empty_array.clone(),
+                expected_max: empty_array,
+            }
+            .run();
+        }
+    }
+
+    #[test]
+    fn nan_in_stats() {
+        // /parquet-testing/data/nan_in_stats.parquet
+        // row_groups: 1
+        // "x": Double({min: Some(1.0), max: Some(NaN), distinct_count: None, 
null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
+
+        TestFile::new("nan_in_stats.parquet")
+            .with_column(ExpectedColumn {
+                name: "x",
+                expected_min: Arc::new(Float64Array::from(vec![Some(1.0)])),
+                expected_max: 
Arc::new(Float64Array::from(vec![Some(f64::NAN)])),
+            })
+            .run();
+    }
+
+    #[test]

Review Comment:
   👍 



##########
parquet/tests/arrow_reader/statistics.rs:
##########
@@ -2195,3 +2195,429 @@ async fn test_column_non_existent() {
     }
     .run_with_schema(&schema);
 }
+
+/// The following tests were initially in `arrow-rs/parquet/src/arrow/arrow_reader/statistics.rs`.
+/// Some of them were moved here to avoid confusion and duplication,
+/// including edge conditions and data-file validations for `row_group` statistics only.
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrow::util::test_util::parquet_test_data;
+    use arrow_array::{
+        new_empty_array, ArrayRef, BooleanArray, Decimal128Array, 
Float32Array, Float64Array,
+        Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, 
StringArray,
+        TimestampNanosecondArray,
+    };
+    use arrow_schema::{DataType, SchemaRef, TimeUnit};
+    use bytes::Bytes;
+    use parquet::arrow::parquet_column;
+    use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+    use std::path::PathBuf;
+    use std::sync::Arc;
+    // TODO error cases (with parquet statistics that are mismatched in 
expected type)
+
+    #[test]
+    fn roundtrip_empty() {
+        let all_types = vec![
+            DataType::Null,
+            DataType::Boolean,
+            DataType::Int8,
+            DataType::Int16,
+            DataType::Int32,
+            DataType::Int64,
+            DataType::UInt8,
+            DataType::UInt16,
+            DataType::UInt32,
+            DataType::UInt64,
+            DataType::Float16,
+            DataType::Float32,
+            DataType::Float64,
+            DataType::Timestamp(TimeUnit::Second, None),
+            DataType::Date32,
+            DataType::Date64,
+            // DataType::Time32(Second),

Review Comment:
   Is there any reason not to include these types in the round trip tests? Is 
it because they don't support statistics?
   
   If so, maybe a comment would help like
   ```suggestion
               // Skip types that don't support statistics
               // DataType::Time32(Second),
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to