This is an automated email from the ASF dual-hosted git repository. szita pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new a017e54c98c HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita) a017e54c98c is described below commit a017e54c98c76ccf0c185b47533b336b0a398dc7 Author: ghanko <54805928+gha...@users.noreply.github.com> AuthorDate: Thu Sep 22 13:51:47 2022 +0200 HIVE-25848: Empty result for structs in point lookup optimization with vectorization on (#3592) (Gergely Hanko, reviewed by Alessandro Solimando and Adam Szita) --- .../hive/ql/exec/vector/VectorizationContext.java | 3 + .../expressions/FilterStructColumnInList.java | 11 +++- .../exec/vector/expressions/VectorExpression.java | 69 +++++++++------------- .../hadoop/hive/ql/udf/generic/GenericUDFIn.java | 9 ++- .../queries/clientpositive/vector_struct_in2.q | 26 ++++++++ .../clientpositive/llap/vector_struct_in2.q.out | 62 +++++++++++++++++++ 6 files changed, 137 insertions(+), 43 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index b5b0b764cd0..ba980f9375c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -2834,6 +2834,9 @@ import com.google.common.annotations.VisibleForTesting; private VectorExpression getInExpression(List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException { ExprNodeDesc colExpr = childExpr.get(0); + if (colExpr instanceof ExprNodeConstantDesc) { + return null; + } List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size()); String colType = colExpr.getTypeString(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java index ca1bf4291ea..115e7fab229 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStructColumnInList.java @@ -18,7 +18,10 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.List; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; @@ -53,7 +56,7 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement /** * After construction you must call setInListValues() to add the values to the IN set * (on the IStringInExpr interface). - * + * <p> * And, call a and b on the IStructInExpr interface. */ public FilterStructColumnInList() { @@ -174,4 +177,10 @@ public class FilterStructColumnInList extends FilterStringColumnInList implement ", structColumnMap " + Arrays.toString(structColumnMap); } + protected Collection<VectorExpression> getChildExpressionsForTransientInit() { + Collection<VectorExpression> result = new ArrayList<>(super.getChildExpressionsForTransientInit()); + Collections.addAll(result, structExpressions); + return result; + } + } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java index 1aa7397b738..800a96f4eb3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorExpression.java @@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; import java.io.Serializable; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Collection; import java.util.Collections; -import java.util.List; +import java.util.Deque; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; @@ -36,32 +38,32 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; /** * Base class for vector expressions. - * + * <p> * A vector expression is a vectorized execution tree that evaluates the same result as a (row-mode) * ExprNodeDesc tree describes. - * + * <p> * A vector expression has 0, 1, or more parameters and an optional output column. These are * normally passed to the vector expression object' constructor. A few special case classes accept * extra parameters via set* method. - * + * <p> * A ExprNodeColumnDesc vectorizes to the IdentityExpression class where the input column number * parameter is the same as the output column number. - * + * <p> * A ExprNodeGenericFuncDesc's generic function can vectorize to many different vectorized objects * depending on the parameter expression kinds (column, constant, etc) and data types. Each * vectorized class implements the getDecription which indicates the particular expression kind * and data type specialization that class is designed for. The Description is used by the * VectorizationContext class in matching the right vectorized class. - * + * <p> * The constructor parameters need to be in the same order as the generic function because * the VectorizationContext class automates parameter generation and object construction. - * + * <p> * Type information is remembered for the input parameters and the output type. - * + * <p> * A vector expression has optional children vector expressions when 1 or more parameters need * to be calculated into vector scratch columns. Columns and constants do not need children * expressions. - * + * <p> * HOW TO to extend VectorExpression (some basic steps and hints): * 1. Create a subclass, and write a proper getDescriptor() (column/scalar?, number for args?, etc.) * 2. Define an explicit parameterless constructor @@ -78,7 +80,7 @@ public abstract class VectorExpression implements Serializable { /** * Child expressions for parameters -- but only those that need to be computed. - * + * <p> * NOTE: Columns and constants are not included in the children. That is: column numbers and * scalar values are passed via the constructor and remembered by the individual vector expression * classes. They are not represented in the children. @@ -88,7 +90,7 @@ public abstract class VectorExpression implements Serializable { /** * ALL input parameter type information is here including those for (non-computed) columns and * scalar values. - * + * <p> * The vectorExpressionParameters() method is used to get the displayable string for the * parameters used by EXPLAIN, logging, etc. */ @@ -106,7 +108,7 @@ public abstract class VectorExpression implements Serializable { /** * Input column numbers of the vector expression, which should be reused by vector expressions. */ - public int inputColumnNum[] = {-1, -1, -1}; + public int[] inputColumnNum = {-1, -1, -1}; /* * Use this constructor when there is NO output column. @@ -125,9 +127,6 @@ public abstract class VectorExpression implements Serializable { /** * Constructor for 1 input column and 1 output column. - * - * @param inputColumnNum - * @param outputColumnNum */ public VectorExpression(int inputColumnNum, int outputColumnNum) { this(inputColumnNum, -1, -1, outputColumnNum); @@ -135,10 +134,6 @@ public abstract class VectorExpression implements Serializable { /** * Constructor for 2 input columns and 1 output column. - * - * @param inputColumnNum - * @param inputColumnNum2 - * @param outputColumnNum */ public VectorExpression(int inputColumnNum, int inputColumnNum2, int outputColumnNum) { this(inputColumnNum, inputColumnNum2, -1, outputColumnNum); @@ -148,11 +143,6 @@ public abstract class VectorExpression implements Serializable { * Constructor for 3 input columns and 1 output column. Currently, VectorExpression is initialized * for a maximum of 3 input columns. In case an expression with more than 3 columns wants to reuse * logic here, inputColumnNum* fields should be extended. - * - * @param inputColumnNum - * @param inputColumnNum2 - * @param inputColumnNum3 - * @param outputColumnNum */ public VectorExpression(int inputColumnNum, int inputColumnNum2, int inputColumnNum3, int outputColumnNum) { // By default, no children or inputs. @@ -172,10 +162,8 @@ public abstract class VectorExpression implements Serializable { /** * Convenience method for expressions that uses arbitrary number of input columns in an array. - * @param inputColumnNum - * @param outputColumnNum */ - public VectorExpression(int inputColumnNum[], int outputColumnNum) { + public VectorExpression(int[] inputColumnNum, int outputColumnNum) { // By default, no children or inputs. childExpressions = null; inputTypeInfos = null; @@ -200,6 +188,14 @@ public abstract class VectorExpression implements Serializable { return childExpressions; } + protected Collection<VectorExpression> getChildExpressionsForTransientInit() { + if (getChildExpressions() != null) { + return Arrays.asList(getChildExpressions()); + } else { + return Collections.emptyList(); + } + } + //------------------------------------------------------------------------------------------------ public void setInputTypeInfos(TypeInfo ...inputTypeInfos) { @@ -251,17 +247,12 @@ public abstract class VectorExpression implements Serializable { // Well, don't recurse but make sure all children are initialized. vecExpr.transientInit(conf); - List<VectorExpression> newChildren = new ArrayList<VectorExpression>(); - VectorExpression[] children = vecExpr.getChildExpressions(); - if (children != null) { - Collections.addAll(newChildren, children); - } + + Deque<VectorExpression> newChildren = new ArrayDeque<>(vecExpr.getChildExpressionsForTransientInit()); + while (!newChildren.isEmpty()) { - VectorExpression childVecExpr = newChildren.remove(0); - children = childVecExpr.getChildExpressions(); - if (children != null) { - Collections.addAll(newChildren, children); - } + VectorExpression childVecExpr = newChildren.removeFirst(); + newChildren.addAll(childVecExpr.getChildExpressionsForTransientInit()); childVecExpr.transientInit(conf); } } @@ -309,8 +300,6 @@ public abstract class VectorExpression implements Serializable { } /** * This is the primary method to implement expression logic. - * @param batch - * @throws HiveException */ public abstract void evaluate(VectorizedRowBatch batch) throws HiveException; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java index 24852e1b728..fcfb9c7cb04 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIn.java @@ -184,8 +184,13 @@ public class GenericUDFIn extends GenericUDF { break; } case STRUCT: { - if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(conversionHelper - .convertIfNecessary(arguments[0].get(), argumentOIs[0])))) { + Object value; + if (argumentOIs[0] instanceof ConstantObjectInspector) { + value = ((ConstantObjectInspector) argumentOIs[0]).getWritableConstantValue(); + } else { + value = conversionHelper.convertIfNecessary(arguments[0].get(), argumentOIs[0]); + } + if (constantInSet.contains(((StructObjectInspector) compareOI).getStructFieldsDataAsList(value))) { bw.set(true); return bw; } diff --git a/ql/src/test/queries/clientpositive/vector_struct_in2.q b/ql/src/test/queries/clientpositive/vector_struct_in2.q new file mode 100644 index 00000000000..8c5729240cd --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_struct_in2.q @@ -0,0 +1,26 @@ +set hive.fetch.task.conversion=none; + +create table test (a string) partitioned by (y string, m string); +insert into test values ('aa', 2022, 9); + +--=== original bug report, complex query =============================================================================== +select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) ); + + +--=== simple test cases for the distinct causes of the failure of the complex query ==================================== + +--this is needed not to optimize away the problematic parts of the queries +set hive.cbo.enable=false; + +--embedded expression in struct - used to yield empty result +select * from test where (struct(cast(y as int)) IN (struct(2022))); + +--first argument of in expression is const struct - used to yield empty result +select * from test where (struct(2022) IN (struct(2022))); + +--these are needed not to optimize away the problematic part of the query +set hive.optimize.constant.propagation=false; +set hive.optimize.ppd=false; + +--first argument of in expression is const primitive - used to cause error +select * from test where (2022 IN (2022)); diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out new file mode 100644 index 00000000000..a35c426d68a --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/vector_struct_in2.q.out @@ -0,0 +1,62 @@ +PREHOOK: query: create table test (a string) partitioned by (y string, m string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test +POSTHOOK: query: create table test (a string) partitioned by (y string, m string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test +PREHOOK: query: insert into test values ('aa', 2022, 9) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test values ('aa', 2022, 9) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Output: default@test@y=2022/m=9 +POSTHOOK: Lineage: test PARTITION(y=2022,m=9).a SCRIPT [] +PREHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) ) +PREHOOK: type: QUERY +PREHOOK: Input: default@test +PREHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +POSTHOOK: query: select * from test where (y=year(date_sub('2022-09-11',4)) and m=month(date_sub('2022-09-11',4))) or (y=year(date_sub('2022-09-11',10)) and m=month(date_sub('2022-09-11',10)) ) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +POSTHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +aa 2022 9 +PREHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022))) +PREHOOK: type: QUERY +PREHOOK: Input: default@test +PREHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +POSTHOOK: query: select * from test where (struct(cast(y as int)) IN (struct(2022))) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +POSTHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +aa 2022 9 +PREHOOK: query: select * from test where (struct(2022) IN (struct(2022))) +PREHOOK: type: QUERY +PREHOOK: Input: default@test +PREHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +POSTHOOK: query: select * from test where (struct(2022) IN (struct(2022))) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +POSTHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +aa 2022 9 +PREHOOK: query: select * from test where (2022 IN (2022)) +PREHOOK: type: QUERY +PREHOOK: Input: default@test +PREHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +POSTHOOK: query: select * from test where (2022 IN (2022)) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +POSTHOOK: Input: default@test@y=2022/m=9 +#### A masked pattern was here #### +aa 2022 9