This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 73b70e3824 Parquet: Fix ParquetDictionaryRowGroupFilter evaluating NaN (#6431)
73b70e3824 is described below

commit 73b70e3824f049e3e3808e8255a8188a924daf64
Author: Yujiang Zhong <[email protected]>
AuthorDate: Wed Jan 4 02:46:13 2023 +0800

    Parquet: Fix ParquetDictionaryRowGroupFilter evaluating NaN (#6431)
---
 .../parquet/ParquetDictionaryRowGroupFilter.java   |  4 +++
 .../parquet/TestDictionaryRowGroupFilter.java      | 37 +++++++++++++++++++---
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
index 88339e2111..1e06eac1c0 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
@@ -172,6 +172,10 @@ public class ParquetDictionaryRowGroupFilter {
     public <T> Boolean notNaN(BoundReference<T> ref) {
       int id = ref.fieldId();
 
+      if (mayContainNulls.get(id)) {
+        return ROWS_MIGHT_MATCH;
+      }
+
       Boolean hasNonDictPage = isFallback.get(id);
       if (hasNonDictPage == null || hasNonDictPage) {
         return ROWS_MIGHT_MATCH;
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
index f2e0863965..eafa93af1c 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
@@ -109,8 +109,8 @@ public class TestDictionaryRowGroupFilter {
           optional(
               14,
               "decimal_fixed",
-              DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
-          );
+              DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
+          optional(15, "_nans_and_nulls", DoubleType.get()));
+          optional(15, "_nans_and_nulls", DoubleType.get()));
 
   private static final Types.StructType _structFieldType =
       Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
@@ -131,8 +131,8 @@ public class TestDictionaryRowGroupFilter {
           optional(
               14,
               "_decimal_fixed",
-              DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
-          );
+              DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
+          optional(15, "_nans_and_nulls", DoubleType.get()));
+          optional(15, "_nans_and_nulls", DoubleType.get()));
 
   private static final String TOO_LONG_FOR_STATS;
 
@@ -201,6 +201,8 @@ public class TestDictionaryRowGroupFilter {
           builder.set(
               "_decimal_fixed", DECIMAL_MIN_VALUE.add(DECIMAL_STEP.multiply(new BigDecimal(i))));
 
+          builder.set("_nans_and_nulls", (i % 10 == 0) ? null : Double.NaN); // only nans and nulls
+
           Record structNotNull = new Record(structSchema);
           structNotNull.put("_int_field", INT_MIN_VALUE + i);
           builder.set("_struct_not_null", structNotNull); // struct with int
@@ -360,6 +362,33 @@ public class TestDictionaryRowGroupFilter {
     Assert.assertTrue("Should read: no_nans column will contain non-NaN", shouldRead);
   }
 
+  @Test
+  public void testNotNaNOnNaNsAndNulls() {
+    boolean shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("_nans_and_nulls"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    Assert.assertTrue("Should read: _nans_and_nulls column will contain null values", shouldRead);
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("_nans_and_nulls"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    Assert.assertTrue(
+        "Should read: _nans_and_nulls column will contain NaN values which are not null",
+        shouldRead);
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("_nans_and_nulls"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    Assert.assertTrue("Should read: _nans_and_nulls column will contain NaN values", shouldRead);
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("_nans_and_nulls"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    Assert.assertTrue(
+        "Should read: _nans_and_nulls column will contain null values which are not NaN",
+        shouldRead);
+  }
+
   @Test
   public void testStartsWith() {
     boolean shouldRead =

Reply via email to