This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new 041b9c2  HIVE-24209 : Incorrect search argument conversion for NOT BETWEEN operation when vectorization is enabled (Ganesha Shreedhara via Ashutosh Chauhan)
041b9c2 is described below

commit 041b9c25c3694c3fa09b132705eecccab96c6385
Author: Ganesha Shreedhara <ganeshashr...@gmail.com>
AuthorDate: Mon Oct 5 21:39:18 2020 -0700

    HIVE-24209 : Incorrect search argument conversion for NOT BETWEEN operation when vectorization is enabled (Ganesha Shreedhara via Ashutosh Chauhan)

    Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>
---
 .../hive/ql/io/sarg/ConvertAstToSearchArg.java     |   9 +-
 .../clientpositive/vector_between_columns.q        |  43 +++++
 .../llap/vector_between_columns.q.out              | 211 +++++++++++++++++++++
 3 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
index 764c401..fd24eaa 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java
@@ -471,7 +471,14 @@ public class ConvertAstToSearchArg {
     } else if (op == GenericUDFIn.class) {
       createLeaf(PredicateLeaf.Operator.IN, expr, 0);
     } else if (op == GenericUDFBetween.class) {
-      createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+      // Start with NOT operator when the first child of GenericUDFBetween operator is set to TRUE
+      if (Boolean.TRUE.equals(((ExprNodeConstantDesc) expression.getChildren().get(0)).getValue())) {
+        builder.startNot();
+        createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+        builder.end();
+      } else {
+        createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
+      }
     } else if (op == GenericUDFOPNull.class) {
       createLeaf(PredicateLeaf.Operator.IS_NULL, expr, 0);
     } else if (op == GenericUDFOPNotNull.class) {
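For context, here is a minimal illustrative sketch (not part of the patch) of the SearchArgument shape the corrected GenericUDFBetween branch now produces for an inverted BETWEEN such as NOT (data_release BETWEEN 20191201 AND 20200101). It calls the public storage-api builder directly; the class name NotBetweenSargSketch and the hard-coded column name and bounds are assumptions made up for illustration.

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

// Illustrative sketch only; not part of the commit above.
public class NotBetweenSargSketch {
  public static void main(String[] args) {
    // Mirrors the fixed branch: the BETWEEN leaf is wrapped in NOT
    // (builder.startNot(); createLeaf(BETWEEN, ...); builder.end();)
    // instead of being emitted as a plain BETWEEN, which previously made
    // ORC predicate pushdown keep rows inside the range instead of outside it.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startNot()
        .between("data_release", PredicateLeaf.Type.LONG, 20191201L, 20200101L)
        .end()
        .build();
    // Roughly prints: leaf-0 = BETWEEN(data_release, 20191201, 20200101), expr = (not leaf-0)
    System.out.println(sarg);
  }
}

The NOT BETWEEN filterExpr and the FilterLongColumnNotBetween predicateExpression in the new q.out below show the same inversion being carried through on the vectorized side.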
diff --git a/ql/src/test/queries/clientpositive/vector_between_columns.q b/ql/src/test/queries/clientpositive/vector_between_columns.q
index a8e9ca4..63142f3 100644
--- a/ql/src/test/queries/clientpositive/vector_between_columns.q
+++ b/ql/src/test/queries/clientpositive/vector_between_columns.q
@@ -5,6 +5,7 @@ SET hive.auto.convert.join=true;
 set hive.fetch.task.conversion=none;
 set hive.mapred.mode=nonstrict;
 set hive.join.inner.residual=false;
+set hive.optimize.index.filter=true;
 
 -- SORT_QUERY_RESULTS
 --
@@ -30,6 +31,43 @@ create table TINT stored as orc AS SELECT * FROM TINT_txt;
 -- Add a single NULL row that will come from ORC as isRepeated.
 insert into TINT values (NULL, NULL);
 
+CREATE EXTERNAL TABLE test_orc_ppd(
+  data_release bigint,
+  data_owner_ver_id bigint,
+  data_owner_dim_id bigint,
+  data_source_ver_id bigint,
+  data_source_dim_id bigint,
+  data_client_ver_id bigint,
+  data_client_dim_id bigint,
+  data_client_sub_ver_id bigint,
+  data_client_sub_dim_id bigint,
+  quarter_dim_id bigint,
+  market_dim_id bigint,
+  daypart_dim_id bigint,
+  demo_dim_id bigint,
+  station_dim_id bigint,
+  medium_dim_id bigint,
+  ad_length int,
+  exclude int,
+  population int,
+  client_cpp double,
+  client_cpm double,
+  low_cpp double,
+  mid_cpp double,
+  high_cpp double,
+  low_cpm double,
+  mid_cpm double,
+  high_cpm double,
+  low_cpp_index double,
+  mid_cpp_index double,
+  high_cpp_index double,
+  low_cpm_index double,
+  mid_cpm_index double,
+  high_cpm_index double)
+  STORED AS ORC;
+LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd;
+
 explain vectorization expression
 select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint;
 
@@ -40,3 +78,8 @@ explain vectorization expression
 select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint;
 
 select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint;
+
+explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release;
+
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release;
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out b/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
index 546dc45..cb6d521 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_columns.q.out
@@ -82,6 +82,88 @@ POSTHOOK: Output: default@tint
 POSTHOOK: Lineage: tint.cint EXPRESSION []
 POSTHOOK: Lineage: tint.rnum EXPRESSION []
 _col0	_col1
+PREHOOK: query: CREATE EXTERNAL TABLE test_orc_ppd(
+  data_release bigint,
+#### A masked pattern was here ####
+  data_source_ver_id bigint,
+  data_source_dim_id bigint,
+  data_client_ver_id bigint,
+  data_client_dim_id bigint,
+  data_client_sub_ver_id bigint,
+  data_client_sub_dim_id bigint,
+  quarter_dim_id bigint,
+  market_dim_id bigint,
+  daypart_dim_id bigint,
+  demo_dim_id bigint,
+  station_dim_id bigint,
+  medium_dim_id bigint,
+  ad_length int,
+  exclude int,
+  population int,
+  client_cpp double,
+  client_cpm double,
+  low_cpp double,
+  mid_cpp double,
+  high_cpp double,
+  low_cpm double,
+  mid_cpm double,
+  high_cpm double,
+  low_cpp_index double,
+  mid_cpp_index double,
+  high_cpp_index double,
+  low_cpm_index double,
+  mid_cpm_index double,
+  high_cpm_index double)
+  STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_orc_ppd
+POSTHOOK: query: CREATE EXTERNAL TABLE test_orc_ppd(
+  data_release bigint,
+#### A masked pattern was here ####
+  data_source_ver_id bigint,
+  data_source_dim_id bigint,
+  data_client_ver_id bigint,
+  data_client_dim_id bigint,
+  data_client_sub_ver_id bigint,
+  data_client_sub_dim_id bigint,
+  quarter_dim_id bigint,
+  market_dim_id bigint,
+  daypart_dim_id bigint,
+  demo_dim_id bigint,
+  station_dim_id bigint,
+  medium_dim_id bigint,
+  ad_length int,
+  exclude int,
+  population int,
+  client_cpp double,
+  client_cpm double,
+  low_cpp double,
+  mid_cpp double,
+  high_cpp double,
+  low_cpm double,
+  mid_cpm double,
+  high_cpm double,
+  low_cpp_index double,
+  mid_cpp_index double,
+  high_cpp_index double,
+  low_cpm_index double,
+  mid_cpm_index double,
+  high_cpm_index double)
+  STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_orc_ppd
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test_orc_ppd
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/orc_test_ppd'
+OVERWRITE INTO TABLE test_orc_ppd
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test_orc_ppd
 Warning: Shuffle Join MERGEJOIN[9][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product
 PREHOOK: query: explain vectorization expression
 select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint
@@ -413,3 +495,132 @@ tint.rnum	tsint.rnum	tint.cint	tsint.csint
 2	2	0	0
 3	3	1	1
 4	4	10	10
+PREHOOK: query: explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization expression
+select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+Explain
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: test_orc_ppd
+                  filterExpr: data_release NOT BETWEEN 20191201L AND 20200101L (type: boolean)
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  TableScan Vectorization:
+                      native: true
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: FilterLongColumnNotBetween(col 0:bigint, left 20191201, right 20200101)
+                    predicate: data_release NOT BETWEEN 20191201L AND 20200101L (type: boolean)
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      Group By Vectorization:
+                          aggregators: VectorUDAFCountStar(*) -> bigint
+                          className: VectorGroupByOperator
+                          groupByMode: HASH
+                          keyExpressions: col 0:bigint
+                          native: false
+                          vectorProcessingMode: HASH
+                          projectedOutputColumnNums: [0]
+                      keys: data_release (type: bigint)
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: bigint)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: bigint)
+                        Reduce Sink Vectorization:
+                            className: VectorReduceSinkLongOperator
+                            native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: bigint)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                Group By Vectorization:
+                    aggregators: VectorUDAFCountMerge(col 1:bigint) -> bigint
+                    className: VectorGroupByOperator
+                    groupByMode: MERGEPARTIAL
+                    keyExpressions: col 0:bigint
+                    native: false
+                    vectorProcessingMode: MERGE_PARTIAL
+                    projectedOutputColumnNums: [0]
+                keys: KEY._col0 (type: bigint)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  File Sink Vectorization:
+                      className: VectorFileSinkOperator
+                      native: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+POSTHOOK: query: select data_release, count(*) from test_orc_ppd where NOT (data_release BETWEEN 20191201 AND 20200101) group by data_release
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc_ppd
+#### A masked pattern was here ####
+data_release	_c1
+20190301	1024