This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 73b70e3824 Parquet: Fix ParquetDictionaryRowGroupFilter evaluating NaN (#6431)
73b70e3824 is described below
commit 73b70e3824f049e3e3808e8255a8188a924daf64
Author: Yujiang Zhong <[email protected]>
AuthorDate: Wed Jan 4 02:46:13 2023 +0800
Parquet: Fix ParquetDictionaryRowGroupFilter evaluating NaN (#6431)
---
.../parquet/ParquetDictionaryRowGroupFilter.java | 4 +++
.../parquet/TestDictionaryRowGroupFilter.java | 37 +++++++++++++++++++---
2 files changed, 37 insertions(+), 4 deletions(-)
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
index 88339e2111..1e06eac1c0 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java
@@ -172,6 +172,10 @@ public class ParquetDictionaryRowGroupFilter {
public <T> Boolean notNaN(BoundReference<T> ref) {
int id = ref.fieldId();
+ if (mayContainNulls.get(id)) {
+ return ROWS_MIGHT_MATCH;
+ }
+
Boolean hasNonDictPage = isFallback.get(id);
if (hasNonDictPage == null || hasNonDictPage) {
return ROWS_MIGHT_MATCH;
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
index f2e0863965..eafa93af1c 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
@@ -109,8 +109,8 @@ public class TestDictionaryRowGroupFilter {
optional(
14,
"decimal_fixed",
- DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
- );
+ DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
+ optional(15, "_nans_and_nulls", DoubleType.get()));
private static final Types.StructType _structFieldType =
Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
@@ -131,8 +131,8 @@ public class TestDictionaryRowGroupFilter {
optional(
14,
"_decimal_fixed",
- DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
- );
+ DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
+ optional(15, "_nans_and_nulls", DoubleType.get()));
private static final String TOO_LONG_FOR_STATS;
@@ -201,6 +201,8 @@ public class TestDictionaryRowGroupFilter {
builder.set(
"_decimal_fixed",
DECIMAL_MIN_VALUE.add(DECIMAL_STEP.multiply(new BigDecimal(i))));
+ builder.set("_nans_and_nulls", (i % 10 == 0) ? null : Double.NaN); // only nans and nulls
+
Record structNotNull = new Record(structSchema);
structNotNull.put("_int_field", INT_MIN_VALUE + i);
builder.set("_struct_not_null", structNotNull); // struct with int
@@ -360,6 +362,33 @@ public class TestDictionaryRowGroupFilter {
Assert.assertTrue("Should read: no_nans column will contain non-NaN", shouldRead);
}
+ @Test
+ public void testNotNaNOnNaNsAndNulls() {
+ boolean shouldRead =
+ new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("_nans_and_nulls"))
+ .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+ Assert.assertTrue("Should read: _nans_and_nulls column will contain null values", shouldRead);
+
+ shouldRead =
+ new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("_nans_and_nulls"))
+ .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+ Assert.assertTrue(
+ "Should read: _nans_and_nulls column will contain NaN values which are not null",
+ shouldRead);
+
+ shouldRead =
+ new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("_nans_and_nulls"))
+ .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+ Assert.assertTrue("Should read: _nans_and_nulls column will contain NaN values", shouldRead);
+
+ shouldRead =
+ new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("_nans_and_nulls"))
+ .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+ Assert.assertTrue(
+ "Should read: _nans_and_nulls column will contain null values which are not NaN",
+ shouldRead);
+ }
+
@Test
public void testStartsWith() {
boolean shouldRead =