This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 50f54d2 HIVE-25528: Avoid recalculating types after CBO on second AST pass (#2722) (Stephen Carlin reviewed by Alessandro Solimando, Zoltan Haindrich) 50f54d2 is described below commit 50f54d24d4fc367ced045b9b8f25b2a0e358634e Author: scarlin-cloudera <55709772+scarlin-cloud...@users.noreply.github.com> AuthorDate: Wed Oct 27 10:58:20 2021 -0700 HIVE-25528: Avoid recalculating types after CBO on second AST pass (#2722) (Stephen Carlin reviewed by Alessandro Solimando, Zoltan Haindrich) At compile time, Hive parses the query into ASTNodes. For CBO, the ASTNodes get converted into Calcite RexNodes and then get converted back into ASTNodes. After the optimization step is done, the types chosen for each step should be the final type. This commit eliminates any type changes done on the conversion back to the ASTNode. A couple of ptest files changed with this commit. The reason for the change is that there was an extra constant fold done on the conversion back to the ASTNode. This constant fold should have been done at optimization time. A Jira will be filed for this, but there is no adverse effect for this issue. 
--- parser/pom.xml | 5 +++++ .../org/apache/hadoop/hive/ql/parse/ASTNode.java | 13 +++++++++++++ .../optimizer/calcite/translator/ASTConverter.java | 4 ++-- .../calcite/translator/SqlFunctionConverter.java | 4 +++- .../hive/ql/parse/type/TypeCheckProcFactory.java | 17 +++++++++-------- .../hive/ql/udf/generic/GenericUDFBaseNumeric.java | 22 +++++++++++++++++++++- .../perf/tpcds30tb/tez/query23.q.out | 4 ++-- .../clientpositive/perf/tpcds30tb/tez/query6.q.out | 4 ++-- 8 files changed, 57 insertions(+), 16 deletions(-) diff --git a/parser/pom.xml b/parser/pom.xml index 41fee3b..b02476a 100644 --- a/parser/pom.xml +++ b/parser/pom.xml @@ -62,6 +62,11 @@ <version>${junit.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-serde</artifactId> + <version>${project.version}</version> + </dependency> </dependencies> <build> diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java b/parser/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java index f51de08..802872f 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/ASTNode.java @@ -33,6 +33,7 @@ import org.antlr.runtime.tree.Tree; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * @@ -46,6 +47,10 @@ public class ASTNode extends CommonTree implements Node,Serializable { private transient ASTNode rootNode; private transient boolean isValidASTStr; private transient boolean visited = false; + // At parsing time, the typeInfo isn't known. However, Hive has logic that converts + // the CBO plan back into ASTNode objects, and at this point, the typeInfo has + // been calculated by the optimizer. 
+ private transient TypeInfo typeInfo; private static final Interner<ImmutableCommonToken> TOKEN_CACHE = Interners.newWeakInterner(); @@ -155,6 +160,14 @@ public class ASTNode extends CommonTree implements Node,Serializable { this.origin = origin; } + public void setTypeInfo(TypeInfo typeInfo) { + this.typeInfo = typeInfo; + } + + public TypeInfo getTypeInfo() { + return typeInfo; + } + public String dump() { StringBuilder sb = new StringBuilder("\n"); dump(sb); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java index 7073bca..78ecd17 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java @@ -803,7 +803,7 @@ public class ASTConverter { astNodeLst.add(operand.accept(this)); } return SqlFunctionConverter.buildAST(SqlStdOperatorTable.NOT, - Collections.singletonList(SqlFunctionConverter.buildAST(SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, astNodeLst))); + Collections.singletonList(SqlFunctionConverter.buildAST(SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, astNodeLst, call.getType())), call.getType()); case CAST: assert(call.getOperands().size() == 1); if (call.getType().isStruct() || @@ -850,7 +850,7 @@ public class ASTConverter { if (isFlat(call)) { return SqlFunctionConverter.buildAST(op, astNodeLst, 0); } else { - return SqlFunctionConverter.buildAST(op, astNodeLst); + return SqlFunctionConverter.buildAST(op, astNodeLst, call.getType()); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java index 5b147b6..62f10f6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java @@ -236,7 +236,7 @@ public class SqlFunctionConverter { // TODO: 1) handle Agg Func Name translation 2) is it correct to add func // args as child of func? - public static ASTNode buildAST(SqlOperator op, List<ASTNode> children) { + public static ASTNode buildAST(SqlOperator op, List<ASTNode> children, RelDataType type) { HiveToken hToken = calciteToHiveToken.get(op); ASTNode node; if (hToken != null) { @@ -293,6 +293,8 @@ public class SqlFunctionConverter { } } + node.setTypeInfo(TypeConverter.convert(type)); + for (ASTNode c : children) { ParseDriver.adaptor.addChild(node, c); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/type/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/type/TypeCheckProcFactory.java index e005ed9..6ca7ca2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/type/TypeCheckProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/type/TypeCheckProcFactory.java @@ -923,7 +923,7 @@ public class TypeCheckProcFactory<T> { } // Calculate TypeInfo - TypeInfo t = ((ListTypeInfo) myt).getListElementTypeInfo(); + TypeInfo t = node.getTypeInfo() != null ? node.getTypeInfo() : ((ListTypeInfo) myt).getListElementTypeInfo(); expr = exprFactory.createFuncCallExpr(t, fi, funcText, children); } else if (myt.getCategory() == Category.MAP) { if (!TypeInfoUtils.implicitConvertible(exprFactory.getTypeInfo(children.get(1)), @@ -932,7 +932,7 @@ public class TypeCheckProcFactory<T> { ErrorMsg.INVALID_MAPINDEX_TYPE.getMsg(), node)); } // Calculate TypeInfo - TypeInfo t = ((MapTypeInfo) myt).getMapValueTypeInfo(); + TypeInfo t = node.getTypeInfo() != null ? 
node.getTypeInfo() : ((MapTypeInfo) myt).getMapValueTypeInfo(); expr = exprFactory.createFuncCallExpr(t, fi, funcText, children); } else { throw new SemanticException(ASTErrorUtils.getMsg( @@ -1026,7 +1026,7 @@ public class TypeCheckProcFactory<T> { } else { FunctionInfo inFunctionInfo = exprFactory.getFunctionInfo("in"); for (Collection<T> c : expressions.asMap().values()) { - newExprs.add(exprFactory.createFuncCallExpr(null, inFunctionInfo, + newExprs.add(exprFactory.createFuncCallExpr(node.getTypeInfo(), inFunctionInfo, "in", (List<T>) c)); } children.addAll(newExprs); @@ -1048,7 +1048,7 @@ public class TypeCheckProcFactory<T> { childrenList.add(child); } } - expr = exprFactory.createFuncCallExpr(null, fi, funcText, childrenList); + expr = exprFactory.createFuncCallExpr(node.getTypeInfo(), fi, funcText, childrenList); } else if (exprFactory.isAndFunction(fi)) { // flatten AND List<T> childrenList = new ArrayList<>(children.size()); @@ -1062,18 +1062,19 @@ public class TypeCheckProcFactory<T> { childrenList.add(child); } } - expr = exprFactory.createFuncCallExpr(null, fi, funcText, childrenList); + expr = exprFactory.createFuncCallExpr(node.getTypeInfo(), fi, funcText, childrenList); } else if (ctx.isFoldExpr() && exprFactory.convertCASEIntoCOALESCEFuncCallExpr(fi, children)) { // Rewrite CASE into COALESCE fi = exprFactory.getFunctionInfo("coalesce"); - expr = exprFactory.createFuncCallExpr(null, fi, "coalesce", + expr = exprFactory.createFuncCallExpr(node.getTypeInfo(), fi, "coalesce", Lists.newArrayList(children.get(0), exprFactory.createBooleanConstantExpr(Boolean.FALSE.toString()))); if (Boolean.FALSE.equals(exprFactory.getConstantValue(children.get(1)))) { fi = exprFactory.getFunctionInfo("not"); - expr = exprFactory.createFuncCallExpr(null, fi, "not", Lists.newArrayList(expr)); + expr = exprFactory.createFuncCallExpr(node.getTypeInfo(), fi, "not", Lists.newArrayList(expr)); } } else { - expr = exprFactory.createFuncCallExpr(typeInfo, fi, funcText, 
children); + TypeInfo t = (node.getTypeInfo() != null) ? node.getTypeInfo() : typeInfo; + expr = exprFactory.createFuncCallExpr(t, fi, funcText, children); } if (exprFactory.isSTRUCTFuncCallExpr(expr)) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBaseNumeric.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBaseNumeric.java index a7bc658..34a03ed 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBaseNumeric.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBaseNumeric.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.udf.SettableUDF; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; @@ -59,7 +60,7 @@ import org.apache.hive.common.HiveCompat.CompatLevel; * GenericUDF Base Class for operations. */ @Description(name = "op", value = "a op b - Returns the result of operation") -public abstract class GenericUDFBaseNumeric extends GenericUDFBaseBinary { +public abstract class GenericUDFBaseNumeric extends GenericUDFBaseBinary implements SettableUDF { protected transient PrimitiveObjectInspector leftOI; protected transient PrimitiveObjectInspector rightOI; @@ -79,6 +80,12 @@ public abstract class GenericUDFBaseNumeric extends GenericUDFBaseBinary { protected boolean confLookupNeeded = true; protected boolean ansiSqlArithmetic = false; + // A typeinfo that has already been defined. On the first pass when we create the class + // from the ASTNode objects, the typeInfo is unknown. However, once CBO is run, CBO + // defines the type that should be used. 
We don't want to recalculate the typeInfo when + // we convert it back to ASTNodes, so the typeInfo is "predefined" in this case. + private TypeInfo predefinedTypeInfo; + public GenericUDFBaseNumeric() { } @@ -208,6 +215,9 @@ public abstract class GenericUDFBaseNumeric extends GenericUDFBaseBinary { * @throws UDFArgumentException */ private PrimitiveTypeInfo deriveResultTypeInfo() throws UDFArgumentException { + if (predefinedTypeInfo instanceof PrimitiveTypeInfo && predefinedTypeInfo != null) { + return (PrimitiveTypeInfo) predefinedTypeInfo; + } PrimitiveTypeInfo left = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(leftOI); PrimitiveTypeInfo right = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(rightOI); if (!FunctionRegistry.isNumericType(left) || !FunctionRegistry.isNumericType(right)) { @@ -339,4 +349,14 @@ public abstract class GenericUDFBaseNumeric extends GenericUDFBaseBinary { public void setAnsiSqlArithmetic(boolean ansiSqlArithmetic) { this.ansiSqlArithmetic = ansiSqlArithmetic; } + + @Override + public void setTypeInfo(TypeInfo typeInfo) { + this.predefinedTypeInfo = typeInfo; + } + + @Override + public TypeInfo getTypeInfo() { + return this.predefinedTypeInfo; + } } diff --git a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query23.q.out b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query23.q.out index 0518cc1..9345848 100644 --- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query23.q.out +++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query23.q.out @@ -407,14 +407,14 @@ STAGE PLANS: predicate: _col0 is not null (type: boolean) Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: (0.95 * _col0) (type: decimal(31,4)) + expressions: (0.95 * _col0) (type: decimal(37,8)) outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator null sort order: 
sort order: Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col0 (type: decimal(31,4)) + value expressions: _col0 (type: decimal(37,8)) Reducer 12 Execution mode: vectorized, llap Reduce Operator Tree: diff --git a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query6.q.out b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query6.q.out index ec0a2fe..75ded7a 100644 --- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query6.q.out +++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query6.q.out @@ -277,7 +277,7 @@ STAGE PLANS: predicate: CAST( CAST( (_col1 / _col2) AS decimal(11,6)) AS decimal(16,6)) is not null (type: boolean) Statistics: Num rows: 11 Data size: 2310 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: _col0 (type: char(50)), (1.2 * CAST( CAST( (_col1 / _col2) AS decimal(11,6)) AS decimal(16,6))) (type: decimal(19,7)) + expressions: _col0 (type: char(50)), (1.2 * CAST( CAST( (_col1 / _col2) AS decimal(11,6)) AS decimal(16,6))) (type: decimal(14,7)) outputColumnNames: _col0, _col1 Statistics: Num rows: 11 Data size: 2222 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator @@ -286,7 +286,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: char(50)) Statistics: Num rows: 11 Data size: 2222 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: decimal(19,7)) + value expressions: _col1 (type: decimal(14,7)) Reducer 13 Execution mode: vectorized, llap Reduce Operator Tree: