Repository: hive
Updated Branches:
  refs/heads/branch-1 293e22e0e -> 78bedc8e2
  refs/heads/branch-2.0 54760abdc -> 69440a62a
HIVE-13957 : vectorized IN is inconsistent with non-vectorized (at least for decimal in (string)) (Sergey Shelukhin, reviewed by Matt McCline)

Conflicts:
  ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
  ql/src/test/results/clientpositive/spark/vector_between_in.q.out
  ql/src/test/results/clientpositive/tez/vector_between_in.q.out
  ql/src/test/results/clientpositive/vector_between_in.q.out

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/69440a62
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/69440a62
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/69440a62

Branch: refs/heads/branch-2.0
Commit: 69440a62af477e8c237030ca1ed0120c7dc7d787
Parents: 54760ab
Author: Sergey Shelukhin <ser...@apache.org>
Authored: Mon Jun 13 18:32:12 2016 -0700
Committer: Sergey Shelukhin <ser...@apache.org>
Committed: Mon Jun 13 18:48:14 2016 -0700

----------------------------------------------------------------------
 .../ql/exec/vector/VectorizationContext.java | 30 ++++--
 .../hive/ql/udf/generic/GenericUDFUtils.java | 52 +++++++--
 .../clientpositive/vector_string_decimal.q | 21 ++++
 .../spark/vector_between_in.q.out | 2 -
 .../clientpositive/tez/vector_between_in.q.out | 2 -
 .../clientpositive/vector_between_in.q.out | 2 -
 .../clientpositive/vector_string_decimal.q.out | 106 +++++++++++++++++++
 .../hive/serde2/typeinfo/HiveDecimalUtils.java | 4 +-
 8 files changed, 192 insertions(+), 27 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 1eb960d..6601a87 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -494,8 +494,8 @@ public class VectorizationContext { * Given a udf and its children, return the common type to which the children's type should be * cast. */ - private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf, List<ExprNodeDesc> children, - TypeInfo returnType) { + private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf, + List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException { TypeInfo commonType; if (genericUdf instanceof GenericUDFBaseCompare) { @@ -507,9 +507,20 @@ public class VectorizationContext { commonType = returnType; } } else if (genericUdf instanceof GenericUDFIn) { - - // Cast to the type of the first child - return children.get(0).getTypeInfo(); + TypeInfo colTi = children.get(0).getTypeInfo(); + if (colTi.getCategory() != Category.PRIMITIVE) { + return colTi; // Handled later, only struct will be supported. + } + TypeInfo opTi = GenericUDFUtils.deriveInType(children); + if (opTi == null || opTi.getCategory() != Category.PRIMITIVE) { + throw new HiveException("Cannot vectorize IN() - common type is " + opTi); + } + if (((PrimitiveTypeInfo)colTi).getPrimitiveCategory() != + ((PrimitiveTypeInfo)opTi).getPrimitiveCategory()) { + throw new HiveException("Cannot vectorize IN() - casting a column is not supported. 
" + + "Column type is " + colTi + " but the common type is " + opTi); + } + return colTi; } else { // The children type should be converted to return type commonType = returnType; @@ -606,6 +617,7 @@ public class VectorizationContext { } PrimitiveTypeInfo ptinfo = (PrimitiveTypeInfo) inputTypeInfo; int precision = getPrecisionForType(ptinfo); + // TODO: precision and scale would be practically invalid for string conversion (38,38) int scale = HiveDecimalUtils.getScaleForType(ptinfo); return new DecimalTypeInfo(precision, scale); } @@ -1496,8 +1508,8 @@ public class VectorizationContext { /** * Create a filter or boolean-valued expression for column IN ( <list-of-constants> ) */ - private VectorExpression getInExpression(List<ExprNodeDesc> childExpr, Mode mode, TypeInfo returnType) - throws HiveException { + private VectorExpression getInExpression(List<ExprNodeDesc> childExpr, + VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException { ExprNodeDesc colExpr = childExpr.get(0); List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size()); @@ -1505,7 +1517,7 @@ public class VectorizationContext { colType = VectorizationContext.mapTypeNameSynonyms(colType); TypeInfo colTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colType); Category category = colTypeInfo.getCategory(); - if (category == Category.STRUCT){ + if (category == Category.STRUCT) { return getStructInExpression(childExpr, colExpr, colTypeInfo, inChildren, mode, returnType); } else if (category != Category.PRIMITIVE) { return null; @@ -1526,6 +1538,8 @@ public class VectorizationContext { // determine class Class<?> cl = null; + // TODO: the below assumes that all the arguments to IN are of the same type; + // non-vectorized validates that explicitly during UDF init. if (isIntFamily(colType)) { cl = (mode == Mode.FILTER ? 
FilterLongColumnInList.class : LongColumnInList.class); long[] inVals = new long[childrenForInList.size()]; http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java index 3bbe783..2c4c0d0 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java @@ -23,14 +23,17 @@ import java.lang.reflect.Method; import java.lang.reflect.ParameterizedType; import java.lang.reflect.Type; import java.util.HashMap; +import java.util.List; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.serde2.io.HiveCharWritable; import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.IdentityConverter; @@ -168,17 +171,7 @@ public final class GenericUDFUtils { return false; } - /** - * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted - * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects - * that. 
- */ - if (commonTypeInfo instanceof DecimalTypeInfo) { - if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) oiTypeInfo)) || - (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) rTypeInfo))) { - commonTypeInfo = TypeInfoFactory.doubleTypeInfo; - } - } + commonTypeInfo = updateCommonTypeForDecimal(commonTypeInfo, oiTypeInfo, rTypeInfo); returnObjectInspector = TypeInfoUtils .getStandardWritableObjectInspectorFromTypeInfo(commonTypeInfo); @@ -239,6 +232,43 @@ public final class GenericUDFUtils { } + protected static TypeInfo updateCommonTypeForDecimal( + TypeInfo commonTypeInfo, TypeInfo ti, TypeInfo returnType) { + /** + * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted + * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects + * that. + */ + if (commonTypeInfo instanceof DecimalTypeInfo) { + if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)ti)) || + (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)returnType))) { + return TypeInfoFactory.doubleTypeInfo; + } + } + return commonTypeInfo; + } + + // Based on update() above. + public static TypeInfo deriveInType(List<ExprNodeDesc> children) { + TypeInfo returnType = null; + for (ExprNodeDesc node : children) { + TypeInfo ti = node.getTypeInfo(); + if (ti.getCategory() == Category.PRIMITIVE + && ((PrimitiveTypeInfo)ti).getPrimitiveCategory() == PrimitiveCategory.VOID) { + continue; + } + if (returnType == null) { + returnType = ti; + continue; + } + if (returnType == ti) continue; + TypeInfo commonTypeInfo = FunctionRegistry.getCommonClass(returnType, ti); + if (commonTypeInfo == null) return null; + returnType = updateCommonTypeForDecimal(commonTypeInfo, ti, returnType); + } + return returnType; + } + /** * Convert parameters for the method if needed. 
*/ http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/queries/clientpositive/vector_string_decimal.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_string_decimal.q b/ql/src/test/queries/clientpositive/vector_string_decimal.q new file mode 100644 index 0000000..e69cd77 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_string_decimal.q @@ -0,0 +1,21 @@ +set hive.vectorized.execution.enabled=false; +set hive.fetch.task.conversion=none; + +drop table orc_decimal; +drop table staging; +create table orc_decimal (id decimal(18,0)) stored as orc; + +create table staging (id decimal(18,0)); + +insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0); + +insert overwrite table orc_decimal select id from staging; + +set hive.vectorized.execution.enabled=true; + +explain +select * from orc_decimal where id in ('100000000', '200000000'); +select * from orc_decimal where id in ('100000000', '200000000'); + +drop table orc_decimal; +drop table staging; http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/spark/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out index f1ff784..71e13ab 100644 --- a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out @@ -149,7 +149,6 @@ STAGE PLANS: key expressions: _col0 (type: decimal(20,10)) sort order: + Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: @@ -205,7 +204,6 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value 
expressions: _col0 (type: bigint) - Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/tez/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out index 9466ab2..b8be37e 100644 --- a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out @@ -152,7 +152,6 @@ STAGE PLANS: key expressions: _col0 (type: decimal(20,10)) sort order: + Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: @@ -209,7 +208,6 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) - Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_between_in.q.out b/ql/src/test/results/clientpositive/vector_between_in.q.out index b80da1b..d14e0f2 100644 --- a/ql/src/test/results/clientpositive/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/vector_between_in.q.out @@ -130,7 +130,6 @@ STAGE PLANS: key expressions: _col0 (type: decimal(20,10)) sort order: + Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: decimal(20,10)) @@ -179,7 +178,6 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data 
size: 8 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) - Execution mode: vectorized Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/vector_string_decimal.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_string_decimal.q.out b/ql/src/test/results/clientpositive/vector_string_decimal.q.out new file mode 100644 index 0000000..e0a3563 --- /dev/null +++ b/ql/src/test/results/clientpositive/vector_string_decimal.q.out @@ -0,0 +1,106 @@ +PREHOOK: query: drop table orc_decimal +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table orc_decimal +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table staging +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table staging +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_decimal +POSTHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_decimal +PREHOOK: query: create table staging (id decimal(18,0)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@staging +POSTHOOK: query: create table staging (id decimal(18,0)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@staging +PREHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@staging +POSTHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: 
default@staging +POSTHOOK: Lineage: staging.id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert overwrite table orc_decimal select id from staging +PREHOOK: type: QUERY +PREHOOK: Input: default@staging +PREHOOK: Output: default@orc_decimal +POSTHOOK: query: insert overwrite table orc_decimal select id from staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@staging +POSTHOOK: Output: default@orc_decimal +POSTHOOK: Lineage: orc_decimal.id SIMPLE [(staging)staging.FieldSchema(name:id, type:decimal(18,0), comment:null), ] +PREHOOK: query: explain +select * from orc_decimal where id in ('100000000', '200000000') +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from orc_decimal where id in ('100000000', '200000000') +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orc_decimal + Statistics: Num rows: 4 Data size: 448 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id) IN ('100000000', '200000000') (type: boolean) + Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: decimal(18,0)) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from orc_decimal where id in ('100000000', '200000000') +PREHOOK: type: QUERY +PREHOOK: Input: 
default@orc_decimal +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_decimal where id in ('100000000', '200000000') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_decimal +#### A masked pattern was here #### +100000000 +200000000 +PREHOOK: query: drop table orc_decimal +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@orc_decimal +PREHOOK: Output: default@orc_decimal +POSTHOOK: query: drop table orc_decimal +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@orc_decimal +POSTHOOK: Output: default@orc_decimal +PREHOOK: query: drop table staging +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@staging +PREHOOK: Output: default@staging +POSTHOOK: query: drop table staging +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@staging +POSTHOOK: Output: default@staging http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java index cdd20bb..5caaf6b 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java @@ -77,7 +77,7 @@ public class HiveDecimalUtils { case VOID: return 1; default: - return HiveDecimal.MAX_PRECISION; + return HiveDecimal.SYSTEM_DEFAULT_PRECISION; } } @@ -100,7 +100,7 @@ public class HiveDecimalUtils { case VOID: return 0; default: - return HiveDecimal.MAX_SCALE; + return HiveDecimal.SYSTEM_DEFAULT_SCALE; } }