HIVE-16068: BloomFilter expectedEntries not always using NDV when it's 
available during runtime filtering (Jason Dere, reviewed by Gunther Hagleitner)

Change-Id: I9a5f3960e1b7fc1416f510aefbeab6f28bbafffa


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/43906f9c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/43906f9c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/43906f9c

Branch: refs/heads/branch-2.2
Commit: 43906f9c3dbe2e63ffc744b69925b753a119e8be
Parents: 72ca985
Author: Jason Dere <jd...@hortonworks.com>
Authored: Fri Mar 3 11:53:02 2017 -0800
Committer: Owen O'Malley <omal...@apache.org>
Committed: Tue Mar 28 15:27:57 2017 -0700

----------------------------------------------------------------------
 .../hive/ql/udf/generic/GenericUDAFBloomFilter.java | 16 +++++++++++-----
 .../results/clientpositive/llap/mergejoin.q.out     |  4 ++--
 .../clientpositive/tez/explainanalyze_3.q.out       |  4 ++--
 3 files changed, 15 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
index b32e04a..788aace 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
@@ -20,10 +20,13 @@ package org.apache.hadoop.hive.ql.udf.generic;
 
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.plan.Statistics.State;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
@@ -68,7 +71,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
    */
   public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator {
     // Source operator to get the number of entries
-    private Operator<?> sourceOperator;
+    private SelectOperator sourceOperator;
     private long maxEntries = 0;
 
     // ObjectInspector for input data.
@@ -258,10 +261,13 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
         switch (stats.getColumnStatsState()) {
           case COMPLETE:
           case PARTIAL:
-            // There should only be column stats for one column, use if that is the case.
+            // There should only be column in sourceOperator
             List<ColStatistics> colStats = stats.getColumnStats();
-            if (colStats.size() == 1) {
-              long ndv = colStats.get(0).getCountDistint();
+            ExprNodeColumnDesc colExpr = ExprNodeDescUtils.getColumnExpr(
+                sourceOperator.getConf().getColList().get(0));
+            if (colExpr != null
+                && stats.getColumnStatisticsFromColName(colExpr.getColumn()) != null) {
+              long ndv = stats.getColumnStatisticsFromColName(colExpr.getColumn()).getCountDistint();
               if (ndv > 0) {
                 expectedEntries = ndv;
               }
@@ -279,7 +285,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
       return sourceOperator;
     }
 
-    public void setSourceOperator(Operator<?> sourceOperator) {
+    public void setSourceOperator(SelectOperator sourceOperator) {
       this.sourceOperator = sourceOperator;
     }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/test/results/clientpositive/llap/mergejoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/mergejoin.q.out b/ql/src/test/results/clientpositive/llap/mergejoin.q.out
index 85f14a0..4e6e2b4 100644
--- a/ql/src/test/results/clientpositive/llap/mergejoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/mergejoin.q.out
@@ -63,7 +63,7 @@ STAGE PLANS:
                         outputColumnNames: _col0
                         Statistics: Num rows: 25 Data size: 191 Basic stats: 
COMPLETE Column stats: NONE
                         Group By Operator
-                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=25)
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=14)
                           mode: hash
                           outputColumnNames: _col0, _col1, _col2
                           Statistics: Num rows: 1 Data size: 252 Basic stats: 
COMPLETE Column stats: NONE
@@ -95,7 +95,7 @@ STAGE PLANS:
             Execution mode: vectorized, llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=25)
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=14)
                 mode: final
                 outputColumnNames: _col0, _col1, _col2
                 Statistics: Num rows: 1 Data size: 252 Basic stats: COMPLETE 
Column stats: NONE

http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out
index dfe163e..14fac5f 100644
--- a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out
@@ -869,10 +869,10 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: b
-                  filterExpr: (key is not null and key BETWEEN 
DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and 
in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean)
+                  filterExpr: (key is not null and (key BETWEEN 
DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and 
in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter)))) (type: boolean)
                   Statistics: Num rows: 500/500 Data size: 5312 Basic stats: 
COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (key is not null and key BETWEEN 
DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and 
in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean)
+                    predicate: (key is not null and (key BETWEEN 
DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and 
in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter)))) (type: boolean)
                     Statistics: Num rows: 500/244 Data size: 5312 Basic stats: 
COMPLETE Column stats: NONE
                     Select Operator
                       expressions: key (type: int), value (type: string)

Reply via email to