HIVE-15884: Optimize not between for vectorization (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5f533bce Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5f533bce Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5f533bce Branch: refs/heads/branch-2.2 Commit: 5f533bcebfdfe0d79a2b41de7fc72bd8c1b366c4 Parents: fb1c9fd Author: Pengcheng Xiong <pxi...@apache.org> Authored: Thu Mar 2 11:16:34 2017 -0800 Committer: Owen O'Malley <omal...@apache.org> Committed: Tue Mar 28 15:27:56 2017 -0700 ---------------------------------------------------------------------- .../ql/exec/vector/VectorizationContext.java | 24 +++++++++++++++++++- .../clientpositive/llap/vector_between_in.q.out | 6 ++--- .../spark/vector_between_in.q.out | 12 +++++----- .../clientpositive/tez/vector_between_in.q.out | 6 ++--- .../clientpositive/vector_between_in.q.out | 6 ++--- 5 files changed, 38 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/5f533bce/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index f81a0fb..d8387bf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -102,7 +102,6 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.udf.SettableUDF; import org.apache.hadoop.hive.ql.udf.*; import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; @@ -582,6 +581,29 @@ public class VectorizationContext { ve = getColumnVectorExpression((ExprNodeColumnDesc) exprDesc, mode); } else if (exprDesc instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc; + // push not through between... + if ("not".equals(expr.getFuncText())) { + if (expr.getChildren() != null && expr.getChildren().size() == 1) { + ExprNodeDesc child = expr.getChildren().get(0); + if (child instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc childExpr = (ExprNodeGenericFuncDesc) child; + if ("between".equals(childExpr.getFuncText())) { + ExprNodeConstantDesc flag = (ExprNodeConstantDesc) childExpr.getChildren().get(0); + List<ExprNodeDesc> newChildren = new ArrayList<>(); + if (Boolean.TRUE.equals(flag.getValue())) { + newChildren.add(new ExprNodeConstantDesc(Boolean.FALSE)); + } else { + newChildren.add(new ExprNodeConstantDesc(Boolean.TRUE)); + } + newChildren + .addAll(childExpr.getChildren().subList(1, childExpr.getChildren().size())); + expr.setTypeInfo(childExpr.getTypeInfo()); + expr.setGenericUDF(childExpr.getGenericUDF()); + expr.setChildren(newChildren); + } + } + } + } if (isCustomUDF(expr)) { ve = getCustomUDFExpression(expr, mode); } else { http://git-wip-us.apache.org/repos/asf/hive/blob/5f533bce/ql/src/test/results/clientpositive/llap/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out index 88d97f5..7f9067d 100644 --- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out @@ -317,7 +317,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdate BETWEEN 1968-05-01 AND 1971-09-01) (type: boolean) + predicate: cdate NOT BETWEEN 1968-05-01 AND 1971-09-01 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) @@ -427,7 +427,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + predicate: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE @@ -917,7 +917,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + expressions: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) outputColumnNames: _col0 Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Group By Operator http://git-wip-us.apache.org/repos/asf/hive/blob/5f533bce/ql/src/test/results/clientpositive/spark/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out index 689707f..efbca8c 100644 --- a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out @@ -311,8 +311,8 @@ STAGE PLANS: Filter Vectorization: className: VectorFilterOperator native: true - predicateExpression: SelectColumnIsFalse(col 4)(children: VectorUDFAdaptor(cdate BETWEEN 1968-05-01 AND 1971-09-01) -> 4:boolean) -> boolean - predicate: (not cdate BETWEEN 1968-05-01 AND 1971-09-01) (type: boolean) + predicateExpression: FilterLongColumnNotBetween(col 3, left -610, right 608) -> boolean + predicate: cdate NOT BETWEEN 1968-05-01 AND 1971-09-01 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE >>>>>>> eb1da30... HIVE-15388: HiveParser spends lots of time in parsing >>>>>>> queries with lots of '(' (Pengcheng Xiong, reviewed by Ashutosh >>>>>>> Chauhan, Gunther Hagleitner) Select Operator @@ -332,7 +332,7 @@ STAGE PLANS: groupByVectorOutput: true inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat allNative: false - usesVectorUDFAdaptor: true + usesVectorUDFAdaptor: false vectorized: true >>>>>>> eb1da30... HIVE-15388: HiveParser spends lots of time in parsing >>>>>>> queries with lots of '(' (Pengcheng Xiong, reviewed by Ashutosh >>>>>>> Chauhan, Gunther Hagleitner) Reducer 2 @@ -437,8 +437,8 @@ STAGE PLANS: Filter Vectorization: className: VectorFilterOperator native: true - predicateExpression: SelectColumnIsFalse(col 4)(children: VectorUDFAdaptor(cdecimal1 BETWEEN -2000 AND 4390.1351351351) -> 4:boolean) -> boolean - predicate: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + predicateExpression: FilterDecimalColumnNotBetween(col 1, left -2000, right 4390.1351351351) -> boolean + predicate: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE >>>>>>> eb1da30... HIVE-15388: HiveParser spends lots of time in parsing >>>>>>> queries with lots of '(' (Pengcheng Xiong, reviewed by Ashutosh >>>>>>> Chauhan, Gunther Hagleitner) Select Operator @@ -461,7 +461,7 @@ STAGE PLANS: groupByVectorOutput: true inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat allNative: false - usesVectorUDFAdaptor: true + usesVectorUDFAdaptor: false vectorized: true >>>>>>> eb1da30... HIVE-15388: HiveParser spends lots of time in parsing >>>>>>> queries with lots of '(' (Pengcheng Xiong, reviewed by Ashutosh >>>>>>> Chauhan, Gunther Hagleitner) Reducer 2 http://git-wip-us.apache.org/repos/asf/hive/blob/5f533bce/ql/src/test/results/clientpositive/tez/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out index 8903337..57e6d1a 100644 --- a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out @@ -310,7 +310,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdate BETWEEN 1968-05-01 AND 1971-09-01) (type: boolean) + predicate: cdate NOT BETWEEN 1968-05-01 AND 1971-09-01 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) @@ -418,7 +418,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + predicate: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE @@ -903,7 +903,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + expressions: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) outputColumnNames: _col0 Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Group By Operator http://git-wip-us.apache.org/repos/asf/hive/blob/5f533bce/ql/src/test/results/clientpositive/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_between_in.q.out b/ql/src/test/results/clientpositive/vector_between_in.q.out index 270de4b..cad29dd 100644 --- a/ql/src/test/results/clientpositive/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/vector_between_in.q.out @@ -264,7 +264,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdate BETWEEN 1968-05-01 AND 1971-09-01) (type: boolean) + predicate: cdate NOT BETWEEN 1968-05-01 AND 1971-09-01 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) @@ -356,7 +356,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + predicate: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE Select Operator Statistics: Num rows: 10923 Data size: 2193503 Basic stats: COMPLETE Column stats: NONE @@ -809,7 +809,7 @@ STAGE PLANS: alias: decimal_date_test Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: (not cdecimal1 BETWEEN -2000 AND 4390.1351351351) (type: boolean) + expressions: cdecimal1 NOT BETWEEN -2000 AND 4390.1351351351 (type: boolean) outputColumnNames: _col0 Statistics: Num rows: 12288 Data size: 2467616 Basic stats: COMPLETE Column stats: NONE Group By Operator