PARQUET-645: Fix null handling in DictionaryFilter.

This fixes how null is handled by `DictionaryFilter` for equals predicates. 
Null is never in the dictionary and is encoded by the definition level, so the 
`DictionaryFilter` would never find the value in the dictionary and would 
incorrectly filter row groups whenever the filter was `col == null`.

Author: Ryan Blue <b...@apache.org>

Closes #348 from rdblue/PARQUET-645-fix-null-dictionary-filter and squashes the 
following commits:

ae8dd41 [Ryan Blue] PARQUET-645: Fix null handling in DictionaryFilter.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/35cf1b4d
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/35cf1b4d
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/35cf1b4d

Branch: refs/heads/parquet-1.8.x
Commit: 35cf1b4d4cb1c0e1fb70afae369ee73244c7dada
Parents: 4240cb9
Author: Ryan Blue <b...@apache.org>
Authored: Thu Jun 30 09:47:48 2016 -0700
Committer: Ryan Blue <b...@apache.org>
Committed: Mon Jan 9 16:54:54 2017 -0800

----------------------------------------------------------------------
 .../filter2/dictionarylevel/DictionaryFilter.java       | 12 ++++++++++++
 .../filter2/dictionarylevel/DictionaryFilterTest.java   |  6 ++++++
 2 files changed, 18 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/35cf1b4d/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
----------------------------------------------------------------------
diff --git 
a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
 
b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index 9b03f82..dc1d649 100644
--- 
a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ 
b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -123,6 +123,12 @@ public class DictionaryFilter implements 
FilterPredicate.Visitor<Boolean> {
 
     filterColumn.getColumnPath();
 
+    if (value == null) {
+      // the dictionary contains only non-null values so isn't helpful. this
+      // could check the column stats, but the StatisticsFilter is responsible
+      return BLOCK_MIGHT_MATCH;
+    }
+
     try {
       Set<T> dictSet = expandDictionary(meta);
       if (dictSet != null && !dictSet.contains(value)) {
@@ -150,6 +156,12 @@ public class DictionaryFilter implements 
FilterPredicate.Visitor<Boolean> {
 
     filterColumn.getColumnPath();
 
+    if (value == null) {
+      // the dictionary contains only non-null values so isn't helpful. this
+      // could check the column stats, but the StatisticsFilter is responsible
+      return BLOCK_MIGHT_MATCH;
+    }
+
     try {
       Set<T> dictSet = expandDictionary(meta);
       if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/35cf1b4d/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
----------------------------------------------------------------------
diff --git 
a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
 
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
index 754da68..35b944d 100644
--- 
a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
+++ 
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
@@ -193,6 +193,9 @@ public class DictionaryFilterTest {
 
     assertTrue("Should drop block for upper case letters",
         canDrop(eq(b, Binary.fromString("A")), ccmd, dictionaries));
+
+    assertFalse("Should not drop block for null",
+        canDrop(eq(b, null), ccmd, dictionaries));
   }
 
   @Test
@@ -211,6 +214,9 @@ public class DictionaryFilterTest {
 
     assertFalse("Should not drop block with a known value",
         canDrop(notEq(b, Binary.fromString("B")), ccmd, dictionaries));
+
+    assertFalse("Should not drop block for null",
+        canDrop(notEq(b, null), ccmd, dictionaries));
   }
 
   @Test

Reply via email to