HIVE-11416: CBO: Calcite Operator To Hive Operator (Calcite Return Path): Groupby Optimizer assumes the schema can match after removing RS and GBY (reviewed by Jesus Camacho Rodriguez)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/763cb02b Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/763cb02b Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/763cb02b Branch: refs/heads/hbase-metastore Commit: 763cb02b5eafb0ecd3fd0eb512636a1b092df671 Parents: 57ba795 Author: Pengcheng Xiong <pxi...@apache.org> Authored: Tue Aug 11 11:26:48 2015 -0700 Committer: Pengcheng Xiong <pxi...@apache.org> Committed: Tue Aug 11 11:26:48 2015 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hive/ql/exec/Operator.java | 25 --------- .../hive/ql/optimizer/GroupByOptimizer.java | 58 +++++++++++++++++++- 2 files changed, 57 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/763cb02b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 0f02737..acbe504 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -769,31 +769,6 @@ public abstract class Operator<T extends OperatorDesc> implements Serializable,C } } - // Remove the operators till a certain depth. - // Return true if the remove was successful, false otherwise - public boolean removeChildren(int depth) { - Operator<? extends OperatorDesc> currOp = this; - for (int i = 0; i < depth; i++) { - // If there are more than 1 children at any level, don't do anything - if ((currOp.getChildOperators() == null) || (currOp.getChildOperators().isEmpty()) || - (currOp.getChildOperators().size() > 1)) { - return false; - } - currOp = currOp.getChildOperators().get(0); - } - - setChildOperators(currOp.getChildOperators()); - - List<Operator<? extends OperatorDesc>> parentOps = - new ArrayList<Operator<? extends OperatorDesc>>(); - parentOps.add(this); - - for (Operator<? extends OperatorDesc> op : currOp.getChildOperators()) { - op.setParentOperators(parentOps); - } - return true; - } - /** * Replace one parent with another at the same position. Chilren of the new * parent are not updated http://git-wip-us.apache.org/repos/asf/hive/blob/763cb02b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java index af54286..ce3f59a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java @@ -21,6 +21,7 @@ package org.apache.hadoop.hive.ql.optimizer; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -31,9 +32,12 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -520,12 +524,64 @@ public class GroupByOptimizer implements Transform { return; } - if (groupByOp.removeChildren(depth)) { + if (removeChildren(groupByOp, depth)) { // Use bucketized hive input format - that makes sure that one mapper reads the entire file groupByOp.setUseBucketizedHiveInputFormat(true); groupByOp.getConf().setMode(GroupByDesc.Mode.FINAL); } } + + // Remove the operators till a certain depth. + // Return true if the remove was successful, false otherwise + public boolean removeChildren(Operator<? extends OperatorDesc> currOp, int depth) { + Operator<? extends OperatorDesc> inputOp = currOp; + for (int i = 0; i < depth; i++) { + // If there are more than 1 children at any level, don't do anything + if ((currOp.getChildOperators() == null) || (currOp.getChildOperators().isEmpty()) + || (currOp.getChildOperators().size() > 1)) { + return false; + } + currOp = currOp.getChildOperators().get(0); + } + + // add selectOp to match the schema + // after that, inputOp is the parent of selOp. + for (Operator<? extends OperatorDesc> op : inputOp.getChildOperators()) { + op.getParentOperators().clear(); + } + inputOp.getChildOperators().clear(); + Operator<? extends OperatorDesc> selOp = genOutputSelectForGroupBy(inputOp, currOp); + + // update the childOp of selectOp + selOp.setChildOperators(currOp.getChildOperators()); + + // update the parentOp + for (Operator<? extends OperatorDesc> op : currOp.getChildOperators()) { + op.replaceParent(currOp, selOp); + } + return true; + } + + private Operator<? extends OperatorDesc> genOutputSelectForGroupBy( + Operator<? extends OperatorDesc> parentOp, Operator<? extends OperatorDesc> currOp) { + assert (parentOp.getSchema().getSignature().size() == currOp.getSchema().getSignature().size()); + Iterator<ColumnInfo> pIter = parentOp.getSchema().getSignature().iterator(); + Iterator<ColumnInfo> cIter = currOp.getSchema().getSignature().iterator(); + List<ExprNodeDesc> columns = new ArrayList<ExprNodeDesc>(); + List<String> colName = new ArrayList<String>(); + Map<String, ExprNodeDesc> columnExprMap = new HashMap<String, ExprNodeDesc>(); + while (pIter.hasNext()) { + ColumnInfo pInfo = pIter.next(); + ColumnInfo cInfo = cIter.next(); + ExprNodeDesc column = new ExprNodeColumnDesc(pInfo.getType(), pInfo.getInternalName(), + pInfo.getTabAlias(), pInfo.getIsVirtualCol(), pInfo.isSkewedCol()); + columns.add(column); + colName.add(cInfo.getInternalName()); + columnExprMap.put(cInfo.getInternalName(), column); + } + return OperatorFactory.getAndMakeChild(new SelectDesc(columns, colName), new RowSchema(currOp + .getSchema().getSignature()), columnExprMap, parentOp); + } } /**