PARQUET-645: Fix null handling in DictionaryFilter. This fixes how null is handled by `DictionaryFilter` for equals predicates. Null is never in the dictionary and is encoded by the definition level, so the `DictionaryFilter` would never find the value in the dictionary and would incorrectly filter row groups whenever the filter was `col == null`.
Author: Ryan Blue <b...@apache.org> Closes #348 from rdblue/PARQUET-645-fix-null-dictionary-filter and squashes the following commits: ae8dd41 [Ryan Blue] PARQUET-645: Fix null handling in DictionaryFilter. Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/35cf1b4d Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/35cf1b4d Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/35cf1b4d Branch: refs/heads/parquet-1.8.x Commit: 35cf1b4d4cb1c0e1fb70afae369ee73244c7dada Parents: 4240cb9 Author: Ryan Blue <b...@apache.org> Authored: Thu Jun 30 09:47:48 2016 -0700 Committer: Ryan Blue <b...@apache.org> Committed: Mon Jan 9 16:54:54 2017 -0800 ---------------------------------------------------------------------- .../filter2/dictionarylevel/DictionaryFilter.java | 12 ++++++++++++ .../filter2/dictionarylevel/DictionaryFilterTest.java | 6 ++++++ 2 files changed, 18 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/35cf1b4d/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index 9b03f82..dc1d649 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -123,6 +123,12 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> { filterColumn.getColumnPath(); + if (value == null) { + // the dictionary contains only non-null values so isn't helpful. this + // could check the column stats, but the StatisticsFilter is responsible + return BLOCK_MIGHT_MATCH; + } + try { Set<T> dictSet = expandDictionary(meta); if (dictSet != null && !dictSet.contains(value)) { @@ -150,6 +156,12 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> { filterColumn.getColumnPath(); + if (value == null) { + // the dictionary contains only non-null values so isn't helpful. this + // could check the column stats, but the StatisticsFilter is responsible + return BLOCK_MIGHT_MATCH; + } + try { Set<T> dictSet = expandDictionary(meta); if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) { http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/35cf1b4d/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java index 754da68..35b944d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java @@ -193,6 +193,9 @@ public class DictionaryFilterTest { assertTrue("Should drop block for upper case letters", canDrop(eq(b, Binary.fromString("A")), ccmd, dictionaries)); + + assertFalse("Should not drop block for null", + canDrop(eq(b, null), ccmd, dictionaries)); } @Test @@ -211,6 +214,9 @@ public class DictionaryFilterTest { assertFalse("Should not drop block with a known value", canDrop(notEq(b, Binary.fromString("B")), ccmd, dictionaries)); + + assertFalse("Should not drop block for null", + canDrop(notEq(b, null), ccmd, dictionaries)); } @Test