This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new e5ea152fd3 Parquet: Fix incorrect skipping of RowGroups with NaNs 
(#6517)
e5ea152fd3 is described below

commit e5ea152fd3d36852dd2831757c62b21e11dd4bfe
Author: Russell Spitzer <[email protected]>
AuthorDate: Fri Jan 13 11:58:26 2023 -0600

    Parquet: Fix incorrect skipping of RowGroups with NaNs (#6517)
---
 .../iceberg/data/TestMetricsRowGroupFilter.java    | 20 +++++
 .../parquet/ParquetMetricsRowGroupFilter.java      | 86 ++++++++++++----------
 .../iceberg/parquet/TestCDHParquetStatistics.java  | 57 +-------------
 3 files changed, 70 insertions(+), 93 deletions(-)

diff --git 
a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java 
b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index 74e3a28e3e..ee6b2dd71e 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -300,6 +300,26 @@ public class TestMetricsRowGroupFilter {
     Assert.assertTrue("Should read: struct type is not skipped", shouldRead);
   }
 
+  @Test
+  public void testFloatWithNan() {
+    // NaNs break Parquet's Min/Max stats, so we should be reading in all 
cases
+    // Only ORC should be able to distinguish using min/max when NaN is present
+    boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0));
+    Assert.assertTrue(shouldRead);
+
+    shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0));
+    Assert.assertTrue(shouldRead);
+
+    shouldRead = shouldRead(lessThan("some_nans", 3.0));
+    Assert.assertTrue(shouldRead);
+
+    shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0));
+    Assert.assertTrue(shouldRead);
+
+    shouldRead = shouldRead(equal("some_nans", 2.0));
+    Assert.assertTrue(shouldRead);
+  }
+
   @Test
   public void testIsNaN() {
     boolean shouldRead = shouldRead(isNaN("all_nans"));
diff --git 
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
 
b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
index fdf697a78a..a8d3e5406b 100644
--- 
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
+++ 
b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
@@ -212,12 +212,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         T lower = min(colStats, id);
@@ -242,12 +242,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         T lower = min(colStats, id);
@@ -272,12 +272,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         T upper = max(colStats, id);
@@ -302,12 +302,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         T upper = max(colStats, id);
@@ -339,12 +339,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         T lower = min(colStats, id);
@@ -389,12 +389,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<?> colStats = stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         Collection<T> literals = literalSet;
@@ -448,12 +448,12 @@ public class ParquetMetricsRowGroupFilter {
 
       Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id);
       if (colStats != null && !colStats.isEmpty()) {
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
-          return ROWS_MIGHT_MATCH;
+        if (allNulls(colStats, valueCount)) {
+          return ROWS_CANNOT_MATCH;
         }
 
-        if (!colStats.hasNonNullValue()) {
-          return ROWS_CANNOT_MATCH;
+        if (minMaxUndefined(colStats)) {
+          return ROWS_MIGHT_MATCH;
         }
 
         ByteBuffer prefixAsBytes = lit.toByteBuffer();
@@ -501,7 +501,7 @@ public class ParquetMetricsRowGroupFilter {
           return ROWS_MIGHT_MATCH;
         }
 
-        if (hasNonNullButNoMinMax(colStats, valueCount)) {
+        if (minMaxUndefined(colStats)) {
           return ROWS_MIGHT_MATCH;
         }
 
@@ -560,24 +560,30 @@ public class ParquetMetricsRowGroupFilter {
   }
 
   /**
-   * Checks against older versions of Parquet statistics which may have a null 
count but undefined
-   * min and max statistics. Returns true if nonNull values exist in the row 
group but no further
-   * statistics are available.
-   *
-   * <p>We can't use {@code statistics.hasNonNullValue()} because it is 
inaccurate with older files
-   * and will return false if min and max are not set.
+   * Older versions of Parquet may produce statistics with a null count but 
undefined min and max
+   * statistics. This is similar to the current behavior when NaN values are 
present.
    *
    * <p>This is specifically for 1.5.0-CDH Parquet builds and later which 
contain the different
   * unusual hasNonNull behavior. OSS Parquet builds are not affected because 
PARQUET-251 prohibits
    * the reading of these statistics from versions of Parquet earlier than 
1.8.0.
    *
    * @param statistics Statistics to check
-   * @param valueCount Number of values in the row group
-   * @return true if nonNull values exist and no other stats can be used
+   * @return true if min and max statistics are null
+   */
+  static boolean nullMinMax(Statistics statistics) {
+    return statistics.getMaxBytes() == null || statistics.getMinBytes() == 
null;
+  }
+
+  /**
+   * The internal logic of Parquet-MR says that if numNulls is set but 
hasNonNull value is false,
+   * then the min/max of the column are undefined.
    */
-  static boolean hasNonNullButNoMinMax(Statistics statistics, long valueCount) 
{
-    return statistics.getNumNulls() < valueCount
-        && (statistics.getMaxBytes() == null || statistics.getMinBytes() == 
null);
+  static boolean minMaxUndefined(Statistics statistics) {
+    return (statistics.isNumNullsSet() && !statistics.hasNonNullValue()) || 
nullMinMax(statistics);
+  }
+
+  static boolean allNulls(Statistics statistics, long valueCount) {
+    return statistics.isNumNullsSet() && valueCount == 
statistics.getNumNulls();
   }
 
   private static boolean mayContainNull(Statistics statistics) {
diff --git 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
 
b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
index 1ceca74d21..ed400d2877 100644
--- 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
+++ 
b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java
@@ -27,67 +27,18 @@ import org.junit.Test;
 
 /**
  * Tests for Parquet 1.5.0-Stats which cannot be evaluated like later versions 
of Parquet stats.
- * They are intercepted by the hasNonNullButNoMinMax function which always 
returns ROWS_MAY_MATCH
+ * They are intercepted by the minMaxUndefined function which always returns 
ROWS_MIGHT_MATCH
  */
 public class TestCDHParquetStatistics {
 
   @Test
   public void testCDHParquetStatistcs() {
     Statistics cdhBinaryColumnStats = mock(Statistics.class);
+    when(cdhBinaryColumnStats.isEmpty()).thenReturn(false);
+    when(cdhBinaryColumnStats.hasNonNullValue()).thenReturn(true);
     when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
     when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
     when(cdhBinaryColumnStats.getNumNulls()).thenReturn(0L);
-    Assert.assertTrue(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
-  }
-
-  @Test
-  public void testCDHParquetStatisticsNullNotSet() {
-    Statistics cdhBinaryColumnStats = mock(Statistics.class);
-    when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
-    when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
-    when(cdhBinaryColumnStats.getNumNulls()).thenReturn(-1L);
-    Assert.assertTrue(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
-  }
-
-  @Test
-  public void testCDHParquetStatistcsAllNull() {
-    Statistics cdhBinaryColumnStats = mock(Statistics.class);
-    when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
-    when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
-    when(cdhBinaryColumnStats.getNumNulls()).thenReturn(50L);
-    Assert.assertFalse(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
-  }
-
-  @Test
-  public void testNonCDHParquetStatistics() {
-    Statistics normalBinaryColumnStats = mock(Statistics.class);
-    when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getNumNulls()).thenReturn(0L);
-    Assert.assertFalse(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 
50L));
-  }
-
-  @Test
-  public void testNonCDHParquetStatisticsNullNotSet() {
-    Statistics normalBinaryColumnStats = mock(Statistics.class);
-    when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getNumNulls()).thenReturn(-1L);
-    Assert.assertFalse(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 
50L));
-  }
-
-  @Test
-  public void testNonCDHParquetStatisticsAllNull() {
-    Statistics normalBinaryColumnStats = mock(Statistics.class);
-    when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
-    when(normalBinaryColumnStats.getNumNulls()).thenReturn(50L);
-    Assert.assertFalse(
-        
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 
50L));
+    
Assert.assertTrue(ParquetMetricsRowGroupFilter.minMaxUndefined(cdhBinaryColumnStats));
   }
 }

Reply via email to