HIVE-16068: BloomFilter expectedEntries not always using NDV when it's available during runtime filtering (Jason Dere, reviewed by Gunther Hagleitner)
Change-Id: I9a5f3960e1b7fc1416f510aefbeab6f28bbafffa Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/43906f9c Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/43906f9c Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/43906f9c Branch: refs/heads/branch-2.2 Commit: 43906f9c3dbe2e63ffc744b69925b753a119e8be Parents: 72ca985 Author: Jason Dere <jd...@hortonworks.com> Authored: Fri Mar 3 11:53:02 2017 -0800 Committer: Owen O'Malley <omal...@apache.org> Committed: Tue Mar 28 15:27:57 2017 -0700 ---------------------------------------------------------------------- .../hive/ql/udf/generic/GenericUDAFBloomFilter.java | 16 +++++++++++----- .../results/clientpositive/llap/mergejoin.q.out | 4 ++-- .../clientpositive/tez/explainanalyze_3.q.out | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java index b32e04a..788aace 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java @@ -20,10 +20,13 @@ package org.apache.hadoop.hive.ql.udf.generic; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.Statistics.State; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -68,7 +71,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { */ public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator { // Source operator to get the number of entries - private Operator<?> sourceOperator; + private SelectOperator sourceOperator; private long maxEntries = 0; // ObjectInspector for input data. @@ -258,10 +261,13 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { switch (stats.getColumnStatsState()) { case COMPLETE: case PARTIAL: - // There should only be column stats for one column, use if that is the case. + // There should only be column in sourceOperator List<ColStatistics> colStats = stats.getColumnStats(); - if (colStats.size() == 1) { - long ndv = colStats.get(0).getCountDistint(); + ExprNodeColumnDesc colExpr = ExprNodeDescUtils.getColumnExpr( + sourceOperator.getConf().getColList().get(0)); + if (colExpr != null + && stats.getColumnStatisticsFromColName(colExpr.getColumn()) != null) { + long ndv = stats.getColumnStatisticsFromColName(colExpr.getColumn()).getCountDistint(); if (ndv > 0) { expectedEntries = ndv; } @@ -279,7 +285,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { return sourceOperator; } - public void setSourceOperator(Operator<?> sourceOperator) { + public void setSourceOperator(SelectOperator sourceOperator) { this.sourceOperator = sourceOperator; } http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/test/results/clientpositive/llap/mergejoin.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/mergejoin.q.out b/ql/src/test/results/clientpositive/llap/mergejoin.q.out index 85f14a0..4e6e2b4 100644 --- a/ql/src/test/results/clientpositive/llap/mergejoin.q.out +++ b/ql/src/test/results/clientpositive/llap/mergejoin.q.out @@ -63,7 +63,7 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=25) + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=14) mode: hash outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 252 Basic stats: COMPLETE Column stats: NONE @@ -95,7 +95,7 @@ STAGE PLANS: Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator - aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=25) + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=14) mode: final outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 252 Basic stats: COMPLETE Column stats: NONE http://git-wip-us.apache.org/repos/asf/hive/blob/43906f9c/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out index dfe163e..14fac5f 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out @@ -869,10 +869,10 @@ STAGE PLANS: Map Operator Tree: TableScan alias: b - filterExpr: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) + filterExpr: (key is not null and (key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter)))) (type: boolean) Statistics: Num rows: 500/500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) + predicate: (key is not null and (key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter)))) (type: boolean) Statistics: Num rows: 500/244 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int), value (type: string)