This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new e5ea152fd3 Parquet: Fix incorrect skipping of RowGroups with NaNs
(#6517)
e5ea152fd3 is described below
commit e5ea152fd3d36852dd2831757c62b21e11dd4bfe
Author: Russell Spitzer <[email protected]>
AuthorDate: Fri Jan 13 11:58:26 2023 -0600
Parquet: Fix incorrect skipping of RowGroups with NaNs (#6517)
---
.../iceberg/data/TestMetricsRowGroupFilter.java | 20 +++++
.../parquet/ParquetMetricsRowGroupFilter.java | 86 ++++++++++++----------
.../iceberg/parquet/TestCDHParquetStatistics.java | 57 +-------------
3 files changed, 70 insertions(+), 93 deletions(-)
diff --git
a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index 74e3a28e3e..ee6b2dd71e 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -300,6 +300,26 @@ public class TestMetricsRowGroupFilter {
Assert.assertTrue("Should read: struct type is not skipped", shouldRead);
}
+ @Test
+ public void testFloatWithNan() {
+ // NaNs break Parquet's min/max stats, so we should be reading in all
cases
+ // Only ORC should be able to distinguish using min/max when NaN is present
+ boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0));
+ Assert.assertTrue(shouldRead);
+
+ shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0));
+ Assert.assertTrue(shouldRead);
+
+ shouldRead = shouldRead(lessThan("some_nans", 3.0));
+ Assert.assertTrue(shouldRead);
+
+ shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0));
+ Assert.assertTrue(shouldRead);
+
+ shouldRead = shouldRead(equal("some_nans", 2.0));
+ Assert.assertTrue(shouldRead);
+ }
+
@Test
public void testIsNaN() {
boolean shouldRead = shouldRead(isNaN("all_nans"));
diff --git
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
index fdf697a78a..a8d3e5406b 100644
---
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
+++
b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
@@ -212,12 +212,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
T lower = min(colStats, id);
@@ -242,12 +242,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
T lower = min(colStats, id);
@@ -272,12 +272,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
T upper = max(colStats, id);
@@ -302,12 +302,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
T upper = max(colStats, id);
@@ -339,12 +339,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
T lower = min(colStats, id);
@@ -389,12 +389,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<?> colStats = stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
Collection<T> literals = literalSet;
@@ -448,12 +448,12 @@ public class ParquetMetricsRowGroupFilter {
Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id);
if (colStats != null && !colStats.isEmpty()) {
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
- return ROWS_MIGHT_MATCH;
+ if (allNulls(colStats, valueCount)) {
+ return ROWS_CANNOT_MATCH;
}
- if (!colStats.hasNonNullValue()) {
- return ROWS_CANNOT_MATCH;
+ if (minMaxUndefined(colStats)) {
+ return ROWS_MIGHT_MATCH;
}
ByteBuffer prefixAsBytes = lit.toByteBuffer();
@@ -501,7 +501,7 @@ public class ParquetMetricsRowGroupFilter {
return ROWS_MIGHT_MATCH;
}
- if (hasNonNullButNoMinMax(colStats, valueCount)) {
+ if (minMaxUndefined(colStats)) {
return ROWS_MIGHT_MATCH;
}
@@ -560,24 +560,30 @@ public class ParquetMetricsRowGroupFilter {
}
/**
- * Checks against older versions of Parquet statistics which may have a null
count but undefined
- * min and max statistics. Returns true if nonNull values exist in the row
group but no further
- * statistics are available.
- *
- * <p>We can't use {@code statistics.hasNonNullValue()} because it is
inaccurate with older files
- * and will return false if min and max are not set.
* Older versions of Parquet may produce statistics that have a null count but
undefined min and max
* statistics. This is similar to the current behavior when NaN values are
present.
*
* <p>This is specifically for 1.5.0-CDH Parquet builds and later which
contain the different
* unusual hasNonNull behavior. OSS Parquet builds are not affected because
PARQUET-251 prohibits
* the reading of these statistics from versions of Parquet earlier than
1.8.0.
*
* @param statistics Statistics to check
- * @param valueCount Number of values in the row group
- * @return true if nonNull values exist and no other stats can be used
+ * @return true if min and max statistics are null
+ */
+ static boolean nullMinMax(Statistics statistics) {
+ return statistics.getMaxBytes() == null || statistics.getMinBytes() ==
null;
+ }
+
+ /**
+ * The internal logic of Parquet-MR says that if numNulls is set but
hasNonNull value is false,
+ * then the min/max of the column are undefined.
*/
- static boolean hasNonNullButNoMinMax(Statistics statistics, long valueCount)
{
- return statistics.getNumNulls() < valueCount
- && (statistics.getMaxBytes() == null || statistics.getMinBytes() ==
null);
+ static boolean minMaxUndefined(Statistics statistics) {
+ return (statistics.isNumNullsSet() && !statistics.hasNonNullValue()) ||
nullMinMax(statistics);
+ }
+
+ static boolean allNulls(Statistics statistics, long valueCount) {
+ return statistics.isNumNullsSet() && valueCount ==
statistics.getNumNulls();
}
private static boolean mayContainNull(Statistics statistics) {
diff --git
a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
index 1ceca74d21..ed400d2877 100644
---
a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
+++
b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
@@ -27,67 +27,18 @@ import org.junit.Test;
/**
* Tests for Parquet 1.5.0-Stats which cannot be evaluated like later versions
of Parquet stats.
- * They are intercepted by the hasNonNullButNoMinMax function which always
returns ROWS_MAY_MATCH
+ * They are intercepted by the minMaxUndefined function which always returns
ROWS_MAY_MATCH
*/
public class TestCDHParquetStatistics {
@Test
public void testCDHParquetStatistcs() {
Statistics cdhBinaryColumnStats = mock(Statistics.class);
+ when(cdhBinaryColumnStats.isEmpty()).thenReturn(false);
+ when(cdhBinaryColumnStats.hasNonNullValue()).thenReturn(true);
when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
when(cdhBinaryColumnStats.getNumNulls()).thenReturn(0L);
- Assert.assertTrue(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
- }
-
- @Test
- public void testCDHParquetStatisticsNullNotSet() {
- Statistics cdhBinaryColumnStats = mock(Statistics.class);
- when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
- when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
- when(cdhBinaryColumnStats.getNumNulls()).thenReturn(-1L);
- Assert.assertTrue(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
- }
-
- @Test
- public void testCDHParquetStatistcsAllNull() {
- Statistics cdhBinaryColumnStats = mock(Statistics.class);
- when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
- when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
- when(cdhBinaryColumnStats.getNumNulls()).thenReturn(50L);
- Assert.assertFalse(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
- }
-
- @Test
- public void testNonCDHParquetStatistics() {
- Statistics normalBinaryColumnStats = mock(Statistics.class);
- when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getNumNulls()).thenReturn(0L);
- Assert.assertFalse(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats,
50L));
- }
-
- @Test
- public void testNonCDHParquetStatisticsNullNotSet() {
- Statistics normalBinaryColumnStats = mock(Statistics.class);
- when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getNumNulls()).thenReturn(-1L);
- Assert.assertFalse(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats,
50L));
- }
-
- @Test
- public void testNonCDHParquetStatisticsAllNull() {
- Statistics normalBinaryColumnStats = mock(Statistics.class);
- when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
- when(normalBinaryColumnStats.getNumNulls()).thenReturn(50L);
- Assert.assertFalse(
-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats,
50L));
+
Assert.assertTrue(ParquetMetricsRowGroupFilter.minMaxUndefined(cdhBinaryColumnStats));
}
}