alamb commented on code in PR #9619:
URL: https://github.com/apache/arrow-rs/pull/9619#discussion_r3318114658
##########
parquet/src/column/writer/mod.rs:
##########
@@ -2795,6 +2861,139 @@ mod tests {
}
}
+ #[test]
+ fn test_ieee754_total_order_float() {
+ // Test IEEE 754 total order for f32
+ // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf <
+NaN
+ let neg_nan = f32::from_bits(0xffc00000);
+ let neg_inf = f32::NEG_INFINITY;
+ let neg_one = -1.0_f32;
+ let neg_zero = -0.0_f32;
+ let pos_zero = 0.0_f32;
+ let pos_one = 1.0_f32;
+ let pos_inf = f32::INFINITY;
+ let pos_nan = f32::from_bits(0x7fc00000);
+
+ let values = vec![
+ pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf,
pos_zero,
+ ];
+
+ let stats = statistics_roundtrip::<FloatType>(&values);
+ if let Statistics::Float(stats) = stats {
+ // With IEEE 754 total order, min should be -NaN, max should be
+NaN
+ // But since we filter out NaN values, min should be -Inf, max
should be +Inf
+ assert_eq!(stats.min_opt().unwrap(), &neg_inf);
+ assert_eq!(stats.max_opt().unwrap(), &pos_inf);
+ assert_eq!(stats.nan_count_opt(), Some(2)); // neg_nan and pos_nan
+ } else {
+ panic!("Expected float statistics");
+ }
+ }
+
+ #[test]
+ fn test_ieee754_total_order_float_only_nan() {
+ // Test IEEE 754 total order for f32
+ // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf <
+NaN
Review Comment:
This comment seems incorrect as the test is for different nan encodings
##########
parquet/src/column/writer/mod.rs:
##########
@@ -2795,6 +2861,139 @@ mod tests {
}
}
+ #[test]
+ fn test_ieee754_total_order_float() {
+ // Test IEEE 754 total order for f32
+ // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf <
+NaN
+ let neg_nan = f32::from_bits(0xffc00000);
+ let neg_inf = f32::NEG_INFINITY;
+ let neg_one = -1.0_f32;
+ let neg_zero = -0.0_f32;
+ let pos_zero = 0.0_f32;
+ let pos_one = 1.0_f32;
+ let pos_inf = f32::INFINITY;
+ let pos_nan = f32::from_bits(0x7fc00000);
+
+ let values = vec![
+ pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf,
pos_zero,
+ ];
+
+ let stats = statistics_roundtrip::<FloatType>(&values);
+ if let Statistics::Float(stats) = stats {
+ // With IEEE 754 total order, min should be -NaN, max should be
+NaN
+ // But since we filter out NaN values, min should be -Inf, max
should be +Inf
+ assert_eq!(stats.min_opt().unwrap(), &neg_inf);
+ assert_eq!(stats.max_opt().unwrap(), &pos_inf);
+ assert_eq!(stats.nan_count_opt(), Some(2)); // neg_nan and pos_nan
+ } else {
+ panic!("Expected float statistics");
+ }
+ }
+
+ #[test]
+ fn test_ieee754_total_order_float_only_nan() {
+ // Test IEEE 754 total order for f32
+ // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf <
+NaN
+ let neg_nan1 = f32::from_bits(0xffc00000);
Review Comment:
TIL there are multiple encodings of NaN: https://en.wikipedia.org/wiki/NaN
##########
parquet/src/basic.rs:
##########
@@ -1025,14 +1029,36 @@ impl ColumnOrder {
converted_type: ConvertedType,
physical_type: Type,
) -> SortOrder {
- Self::sort_order_for_type(logical_type.as_ref(), converted_type,
physical_type)
+ Self::column_order_for_type(logical_type.as_ref(), converted_type,
physical_type)
+ .sort_order()
+ }
+
+ /// Returns the `ColumnOrder` for a physical/logical type.
+ pub fn column_order_for_type(
+ logical_type: Option<&LogicalType>,
+ converted_type: ConvertedType,
+ physical_type: Type,
+ ) -> ColumnOrder {
+ if Some(&LogicalType::Float16) == logical_type
+ || matches!(physical_type, Type::FLOAT | Type::DOUBLE)
+ {
+ ColumnOrder::IEEE_754_TOTAL_ORDER
+ } else {
+ let sort_order =
+ Self::sort_order_for_type(logical_type, converted_type,
physical_type, true);
+ ColumnOrder::TYPE_DEFINED_ORDER(sort_order)
+ }
}
/// Returns sort order for a physical/logical type.
+ ///
+ /// `is_type_defined` indicates whether the column order for this type is
Review Comment:
Minor nit: why not just pass the actual ColumOrder enum (rather than a
flag?) That would perhaps make the code more self documenting as well as allow
extension in the future without breaking API changes
##########
parquet/tests/arrow_reader/statistics.rs:
##########
@@ -781,7 +781,7 @@ async fn test_float_16() {
expected_min: Arc::new(Float16Array::from(vec![
f16::from_f32(-5.),
f16::from_f32(-4.),
- f16::from_f32(-0.),
+ f16::from_f32(0.),
Review Comment:
why change the tests? If the min really is -0 shouldn't that be reflected?
##########
parquet/src/column/writer/mod.rs:
##########
@@ -900,6 +904,27 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E>
{
Ok(())
}
+ // For float columns, always provide Some(n), even if n is 0
+ // For non-float columns, always provide None
+ fn get_nan_count<T: ParquetValueType>(&self) -> Option<i64> {
+ let nan_count = || {
+ let nan_count = self.page_metrics.num_page_nans.unwrap_or(0);
+ match i64::try_from(nan_count) {
+ Ok(count) => Some(count),
+ _ => Some(i64::MAX),
Review Comment:
This is probably ok -- another perhaps safer thing would be to return `None`
if the value overflows i64
I doubt there is any actual dataset that has than i64::MAX values -- but I
can imagine an invalid / adversarial input (or something that underflowed on
the writer) could have such a value 🤷
##########
parquet/tests/ieee754_nan_interop.rs:
##########
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Interoperability test for
https://github.com/apache/parquet-format/pull/514.
+//! Demonstrate reading NaN statstics and counts from a file generated with
+//! parquet-java, and show that on write we produce the same statistics.
+
+use bytes::Bytes;
+use core::f32;
+use half::f16;
+use std::{path::PathBuf, sync::Arc};
+
+use arrow::util::test_util::parquet_test_data;
+use arrow_array::{Array, Float16Array, Float32Array, Float64Array,
RecordBatch, UInt64Array};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::{
+ arrow::{
+ ArrowWriter,
+ arrow_reader::{ArrowReaderBuilder, ArrowReaderOptions,
statistics::StatisticsConverter},
+ },
+ errors::Result,
+ file::{metadata::ParquetMetaData, properties::WriterProperties},
+ schema::types::SchemaDescriptor,
+};
+
+const NAN_COUNTS: [u64; 5] = [0, 4, 10, 0, 0];
+
+const FLOAT_NEG_NAN_SMALL: f32 = f32::from_bits(0xffffffff);
+const FLOAT_NEG_NAN_LARGE: f32 = f32::from_bits(0xfff00001);
+const FLOAT_NAN_SMALL: f32 = f32::from_bits(0x7fc00001);
+const FLOAT_NAN_LARGE: f32 = f32::from_bits(0x7fffffff);
+
+const FLOAT_MINS: [f32; 5] = [-2.0, -2.0, FLOAT_NEG_NAN_SMALL, 0.0, -5.0];
+const FLOAT_MAXS: [f32; 5] = [5.0, 3.0, FLOAT_NAN_LARGE, 5.0, -0.0];
+
+fn validate_float_metadata(
+ metadata: &ParquetMetaData,
+ arrow_schema: &Schema,
+ parquet_schema: &SchemaDescriptor,
+) -> Result<()> {
+ let converter = StatisticsConverter::try_new("float_ieee754",
arrow_schema, parquet_schema)?;
+ let row_group_indices: Vec<_> = (0..metadata.num_row_groups()).collect();
+
+ // verify column statistics mins
+ let exp: Arc<dyn Array> =
Arc::new(Float32Array::from(FLOAT_MINS.to_vec()));
+ let mins = converter.row_group_mins(metadata.row_groups())?;
+ assert_eq!(&mins, &exp);
+
+ // verify page mins (should be 1 page per row group, so should be same)
+ let page_mins = converter.data_page_mins(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_mins, &exp);
+
+ let exp: Arc<dyn Array> =
Arc::new(Float32Array::from(FLOAT_MAXS.to_vec()));
+ let maxs = converter.row_group_maxes(metadata.row_groups())?;
+ assert_eq!(&maxs, &exp);
+
+ // verify page maxs (should be 1 page per row group, so should be same)
+ let page_maxs = converter.data_page_maxes(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_maxs, &exp);
+
+ let exp = UInt64Array::from(NAN_COUNTS.to_vec());
+ let nans = converter.row_group_nan_counts(metadata.row_groups())?;
+ assert_eq!(&nans, &exp);
+
+ let page_nans = converter.data_page_nan_counts(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_nans, &exp);
+
+ Ok(())
+}
+
+const DOUBLE_NEG_NAN_SMALL: f64 = f64::from_bits(0xffffffffffffffff);
+const DOUBLE_NEG_NAN_LARGE: f64 = f64::from_bits(0xfff0000000000001);
+const DOUBLE_NAN_SMALL: f64 = f64::from_bits(0x7ff0000000000001);
+const DOUBLE_NAN_LARGE: f64 = f64::from_bits(0x7fffffffffffffff);
+
+const DOUBLE_MINS: [f64; 5] = [-2.0, -2.0, DOUBLE_NEG_NAN_SMALL, 0.0, -5.0];
+const DOUBLE_MAXS: [f64; 5] = [5.0, 3.0, DOUBLE_NAN_LARGE, 5.0, -0.0];
+
+fn validate_double_metadata(
+ metadata: &ParquetMetaData,
+ arrow_schema: &Schema,
+ parquet_schema: &SchemaDescriptor,
+) -> Result<()> {
+ let converter = StatisticsConverter::try_new("double_ieee754",
arrow_schema, parquet_schema)?;
+ let row_group_indices: Vec<_> = (0..metadata.num_row_groups()).collect();
+
+ // verify column statistics mins
+ let exp: Arc<dyn Array> =
Arc::new(Float64Array::from(DOUBLE_MINS.to_vec()));
+ let mins = converter.row_group_mins(metadata.row_groups())?;
+ assert_eq!(&mins, &exp);
+
+ // verify page mins (should be 1 page per row group, so should be same)
+ let page_mins = converter.data_page_mins(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_mins, &exp);
+
+ let exp: Arc<dyn Array> =
Arc::new(Float64Array::from(DOUBLE_MAXS.to_vec()));
+ let maxs = converter.row_group_maxes(metadata.row_groups())?;
+ assert_eq!(&maxs, &exp);
+
+ // verify page maxs (should be 1 page per row group, so should be same)
+ let page_maxs = converter.data_page_maxes(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_maxs, &exp);
+
+ let exp = UInt64Array::from(NAN_COUNTS.to_vec());
+ let nans = converter.row_group_nan_counts(metadata.row_groups())?;
+ assert_eq!(&nans, &exp);
+
+ let page_nans = converter.data_page_nan_counts(
+ metadata.column_index().unwrap(),
+ metadata.offset_index().unwrap(),
+ &row_group_indices,
+ )?;
+ assert_eq!(&page_nans, &exp);
+
+ Ok(())
+}
+
+const FLOAT16_NEG_NAN_SMALL: f16 = f16::from_bits(0xffff);
+const FLOAT16_NEG_NAN_LARGE: f16 = f16::from_bits(0xfc01);
+const FLOAT16_NAN_SMALL: f16 = f16::from_bits(0x7c01);
+const FLOAT16_NAN_LARGE: f16 = f16::from_bits(0x7fff);
+
+const FLOAT16_MINS: [f16; 5] = [
+ f16::from_bits(0xc000),
+ f16::from_bits(0xc000),
+ FLOAT16_NEG_NAN_SMALL,
+ f16::from_bits(0x0000),
+ f16::from_bits(0xc500),
+];
+const FLOAT16_MAXS: [f16; 5] = [
+ f16::from_bits(0x4500),
+ f16::from_bits(0x4200),
+ FLOAT16_NAN_LARGE,
+ f16::from_bits(0x4500),
+ f16::from_bits(0x8000),
+];
+
+fn validate_float16_metadata(
Review Comment:
It is unfortunate we have ~ 3 copies of the same code with slight
differences. However the effort to macroize or template the thing t handle the
different floating point types is probably not worth it
##########
parquet/tests/arrow_reader/bad_data.rs:
##########
@@ -132,6 +133,15 @@ fn test_arrow_rs_gh_45185_dict_levels() {
);
}
+#[test]
+fn test_arrow_gh_47662() {
Review Comment:
Looks like this is a reasonable behavior for this file. Here is the issue
- https://github.com/apache/arrow/issues/47662
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]