Repository: hive
Updated Branches:
  refs/heads/master 1db3debcb -> 213efd70b
HIVE-20660: Group by statistics estimation could be improved by bounding the total number of rows to source table (Vineet Garg, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/213efd70
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/213efd70
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/213efd70

Branch: refs/heads/master
Commit: 213efd70b389dc0a122ec8501888239446a22d1e
Parents: 1db3deb
Author: Vineet Garg <vg...@apache.org>
Authored: Sun Oct 14 20:35:37 2018 -0700
Committer: Vineet Garg <vg...@apache.org>
Committed: Sun Oct 14 20:35:37 2018 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/OperatorUtils.java      |  91 +++++++++
 .../stats/annotation/StatsRulesProcFactory.java |  17 +-
 .../clientpositive/annotate_stats_groupby.q     |  17 ++
 .../clientpositive/annotate_stats_groupby.q.out | 190 +++++++++++++++++++
 4 files changed, 313 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
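The gist of the change: a group by whose keys come from one input of a join cannot produce more rows than that input supplies, no matter how much the join inflates the intermediate row count, so the estimated cardinality is now additionally capped by the row count at the ReduceSink the grouping keys backtrack to. A minimal sketch of the bounding logic (standalone Java with invented names, ignoring Hive's overflow-safe arithmetic; the real code is in StatsRulesProcFactory below):

    // Sketch only. sourceRSNumRows is the row count at the ReduceSink that the
    // group-by keys backtrack to (see OperatorUtils.findSourceRS below); before
    // this patch the estimate could exceed it when the parent was a
    // row-inflating join.
    static long estimateGroupByCardinality(long parentNumRows, long ndvProduct,
        long parallelism, long sourceRSNumRows) {
      // Case 3 estimate: half the parent rows, or the product of the key NDVs
      // scaled by parallelism, whichever is smaller
      long cardinality = Math.min(parentNumRows / 2, ndvProduct * parallelism);
      // new upper bound introduced by HIVE-20660
      return Math.min(cardinality, sourceRSNumRows);
    }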
http://git-wip-us.apache.org/repos/asf/hive/blob/213efd70/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
index 456786c..f0b41f3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/OperatorUtils.java
@@ -29,9 +29,13 @@ import java.util.Set;
 import java.util.Stack;
 
 import org.apache.hadoop.hive.ql.exec.NodeUtils.Function;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;
 import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
 import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
@@ -501,4 +505,91 @@ public class OperatorUtils {
     }
     return;
   }
+
+  private static List<ExprNodeDesc> backtrackAll(List<ExprNodeDesc> exprs, Operator<? extends OperatorDesc> start,
+      Operator<? extends OperatorDesc> terminal) {
+    List<ExprNodeDesc> backtrackedExprs = new ArrayList<>();
+    try {
+      for (ExprNodeDesc expr : exprs) {
+        ExprNodeDesc backtrackedExpr = ExprNodeDescUtils.backtrack(expr, start, terminal);
+        if (backtrackedExpr == null) {
+          return null;
+        }
+        backtrackedExprs.add(backtrackedExpr);
+      }
+    } catch (SemanticException e) {
+      return null;
+    }
+    return backtrackedExprs;
+  }
+
+  // Two lists of expressions are considered compatible if the following hold:
+  // * they are the same size
+  // * where both entries are column expressions, their table aliases match as well (this is
+  //   checked because otherwise expressions coming out of multiple RSs (e.g. children of a JOIN)
+  //   would end up looking the same)
+  private static boolean areBacktrackedExprsCompatible(final List<ExprNodeDesc> orgexprs,
+      final List<ExprNodeDesc> backtrackedExprs) {
+    if (backtrackedExprs == null || backtrackedExprs.size() != orgexprs.size()) {
+      return false;
+    }
+    for (int i = 0; i < orgexprs.size(); i++) {
+      if (orgexprs.get(i) instanceof ExprNodeColumnDesc && backtrackedExprs.get(i) instanceof ExprNodeColumnDesc) {
+        ExprNodeColumnDesc orgColExpr = (ExprNodeColumnDesc) orgexprs.get(i);
+        ExprNodeColumnDesc backExpr = (ExprNodeColumnDesc) backtrackedExprs.get(i);
+        String orgTabAlias = orgColExpr.getTabAlias();
+        String backTabAlias = backExpr.getTabAlias();
+
+        if (orgTabAlias != null && backTabAlias != null && !orgTabAlias.equals(backTabAlias)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Backtracks the given expressions to the source RS. The expressions could be backtracked
+   * further (e.g. to the table scan), but we stop at the RS because this is used to estimate
+   * the number of rows for a group by, and the estimate is better at the RS, where all the
+   * filters etc. have already been applied.
+   * @param start operator to start backtracking from
+   * @param exprs expressions to backtrack
+   * @return the source RS, or null if no RS is found
+   */
+  public static Operator<? extends OperatorDesc> findSourceRS(Operator<?> start, List<ExprNodeDesc> exprs) {
+    Operator<? extends OperatorDesc> currRS = null; // keeps track of the closest RS seen so far
+    if (start instanceof ReduceSinkOperator) {
+      currRS = (ReduceSinkOperator) start;
+    }
+
+    if (start instanceof UnionOperator) {
+      // Union keeps the schema the same but can change the cardinality, therefore we don't want
+      // to backtrack further into Union
+      return currRS;
+    }
+
+    List<Operator<? extends OperatorDesc>> parents = start.getParentOperators();
+    if (parents == null || parents.isEmpty()) {
+      // reached the end, e.g. a TS operator
+      return null;
+    }
+
+    Operator<? extends OperatorDesc> nextOp = null;
+    List<ExprNodeDesc> backtrackedExprs = null;
+    for (int i = 0; i < parents.size(); i++) {
+      backtrackedExprs = backtrackAll(exprs, start, parents.get(i));
+      if (areBacktrackedExprsCompatible(exprs, backtrackedExprs)) {
+        nextOp = parents.get(i);
+        break;
+      }
+    }
+    if (nextOp != null) {
+      Operator<? extends OperatorDesc> nextRS = findSourceRS(nextOp, backtrackedExprs);
+      if (nextRS != null) {
+        currRS = nextRS;
+      }
+    }
+    return currRS;
+  }
 }
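To make findSourceRS concrete, consider a hypothetical plan shape for the kind of query this patch targets (operator labels invented for illustration):

    TS[t1] -- FIL[j is not null] -- RS[0] --+
                                            +-- JOIN -- GBY[i, j]
    TS[t2] ------------------------ RS[1] --+

Backtracking the GBY keys i and j succeeds only through RS[0], since only there do the backtracked column expressions keep matching table aliases (the compatibility check above), so findSourceRS returns RS[0], and its post-filter row count, rather than the inflated JOIN output, becomes the bound. Union is a deliberate stopping point because it preserves the schema while changing the cardinality.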
http://git-wip-us.apache.org/repos/asf/hive/blob/213efd70/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 1da6d52..32fba6c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -1381,7 +1381,9 @@ public class StatsRulesProcFactory {
           }
         } else {
           // Case 3: column stats, hash aggregation, NO grouping sets
-          cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
+          cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));
+          long orgParentNumRows = getParentNumRows(gop, gop.getConf().getKeys(), conf);
+          cardinality = Math.min(cardinality, orgParentNumRows);
 
           if (LOG.isDebugEnabled()) {
             LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -1397,7 +1399,7 @@ public class StatsRulesProcFactory {
           }
         } else {
           // Case 5: column stats, NO hash aggregation, NO grouping sets
-          cardinality = parentNumRows;
+          cardinality = Math.min(parentNumRows, getParentNumRows(gop, gop.getConf().getKeys(), conf));
 
           if (LOG.isDebugEnabled()) {
             LOG.debug("[Case 5] STATS-" + gop.toString() + ": cardinality: " + cardinality);
@@ -1521,6 +1523,17 @@ public class StatsRulesProcFactory {
       return null;
     }
 
+    private long getParentNumRows(GroupByOperator op, List<ExprNodeDesc> gbyKeys, HiveConf conf) {
+      if (gbyKeys == null || gbyKeys.isEmpty()) {
+        return op.getParentOperators().get(0).getStatistics().getNumRows();
+      }
+      Operator<? extends OperatorDesc> parent = OperatorUtils.findSourceRS(op, gbyKeys);
+      if (parent != null) {
+        return parent.getStatistics().getNumRows();
+      }
+      return op.getParentOperators().get(0).getStatistics().getNumRows();
+    }
+
     /**
      * This method does not take into account many configs used at runtime to
      * disable hash aggregation like HIVEMAPAGGRHASHMINREDUCTION. This method
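Working through the new test below with its fabricated stats: t1_uq12 has 10,000 rows and column j has 30 nulls, so 9,970 rows survive the "j is not null" filter, while the join against t2_uq12's 100,000,000 rows is estimated at 99,700 rows. The group by on (i, j) hits Case 3, which now computes roughly:

    parent (JOIN) rows                  99,700
    parentNumRows / 2                   49,850   <- old cap
    source RS over t1_uq12               9,970   <- new bound (findSourceRS)
    estimate = min(49,850, ndvProduct * parallelism, 9,970) = 9,970

which is the figure the q.out further down shows for the Group By Operator.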
http://git-wip-us.apache.org/repos/asf/hive/blob/213efd70/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/annotate_stats_groupby.q b/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
index 081f057..c727671 100644
--- a/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
+++ b/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
@@ -140,3 +140,20 @@ explain select year from loc_orc_n2 group by year;
 
 -- Case 7: NO column stats - cardinality = 16
 explain select state,locid from loc_orc_n2 group by state,locid with cube;
+set hive.stats.fetch.column.stats=true;
+
+create table t1_uq12(i int, j int);
+alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000');
+alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0');
+alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50');
+
+create table t2_uq12(i2 int, j2 int);
+alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000');
+alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0');
+alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1');
+
+explain select count (1) from t1_uq12,t2_uq12 where t1_uq12.j=t2_uq12.i2 group by t1_uq12.i, t1_uq12.j;
+
+drop table t1_uq12;
+drop table t2_uq12;
+
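Two details of the test setup are worth noting. The statistics are fabricated with "alter table ... update statistics" rather than by loading data, so the estimator sees exactly the row counts, NDVs and null counts above (10,000 rows minus 30 nulls in j gives the 9,970 that recurs through the plan). And "set hive.stats.fetch.column.stats=true" matters: Cases 3 and 5 patched above are the column-stats paths, so without it the query would be estimated by the no-column-stats cases and the new bound would not be exercised.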
http://git-wip-us.apache.org/repos/asf/hive/blob/213efd70/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
index 932e208..fe30d31 100644
--- a/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
+++ b/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
@@ -1346,3 +1346,193 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table t1_uq12(i int, j int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: create table t1_uq12(i int, j int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics set('numRows'='10000', 'rawDataSize'='18000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: alter table t1_uq12 update statistics for column j set('numDVs'='500','numNulls'='30','highValue'='100','lowValue'='50')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: create table t2_uq12(i2 int, j2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: create table t2_uq12(i2 int, j2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_uq12
+PREHOOK: query: alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2_uq12
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: alter table t2_uq12 update statistics set('numRows'='100000000', 'rawDataSize'='10000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2_uq12
+POSTHOOK: Output: default@t2_uq12
+PREHOOK: query: alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2_uq12
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: alter table t2_uq12 update statistics for column i2 set('numDVs'='10000000','numNulls'='0','highValue'='8000','lowValue'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2_uq12
+POSTHOOK: Output: default@t2_uq12
+PREHOOK: query: alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2_uq12
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: alter table t2_uq12 update statistics for column j2 set('numDVs'='10','numNulls'='0','highValue'='800','lowValue'='-1')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2_uq12
+POSTHOOK: Output: default@t2_uq12
+PREHOOK: query: explain select count (1) from t1_uq12,t2_uq12 where t1_uq12.j=t2_uq12.i2 group by t1_uq12.i, t1_uq12.j
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Input: default@t2_uq12
+#### A masked pattern was here ####
+POSTHOOK: query: explain select count (1) from t1_uq12,t2_uq12 where t1_uq12.j=t2_uq12.i2 group by t1_uq12.i, t1_uq12.j
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Input: default@t2_uq12
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: t1_uq12
+            filterExpr: j is not null (type: boolean)
+            Statistics: Num rows: 10000 Data size: 79688 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: j is not null (type: boolean)
+              Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: i (type: int), j (type: int)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col1 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col1 (type: int)
+                  Statistics: Num rows: 9970 Data size: 79448 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: int)
+          TableScan
+            alias: t2_uq12
+            filterExpr: i2 is not null (type: boolean)
+            Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: i2 is not null (type: boolean)
+              Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: i2 (type: int)
+                outputColumnNames: _col0
+                Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: int)
+                  Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          keys:
+            0 _col1 (type: int)
+            1 _col0 (type: int)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 99700 Data size: 797288 Basic stats: COMPLETE Column stats: COMPLETE
+          Group By Operator
+            aggregations: count()
+            keys: _col0 (type: int), _col1 (type: int)
+            mode: hash
+            outputColumnNames: _col0, _col1, _col2
+            Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              key expressions: _col0 (type: int), _col1 (type: int)
+              sort order: ++
+              Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+              Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE
+              value expressions: _col2 (type: bigint)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int), KEY._col1 (type: int)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 9970 Data size: 159496 Basic stats: COMPLETE Column stats: COMPLETE
+          Select Operator
+            expressions: _col2 (type: bigint)
+            outputColumnNames: _col0
+            Statistics: Num rows: 9970 Data size: 79760 Basic stats: COMPLETE Column stats: COMPLETE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 9970 Data size: 79760 Basic stats: COMPLETE Column stats: COMPLETE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: drop table t1_uq12
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1_uq12
+PREHOOK: Output: default@t1_uq12
+POSTHOOK: query: drop table t1_uq12
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1_uq12
+POSTHOOK: Output: default@t1_uq12
+PREHOOK: query: drop table t2_uq12
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2_uq12
+PREHOOK: Output: default@t2_uq12
+POSTHOOK: query: drop table t2_uq12
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2_uq12
+POSTHOOK: Output: default@t2_uq12