This is an automated email from the ASF dual-hosted git repository. sankarh pushed a commit to branch branch-3 in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push: new 2058c2e0dee HIVE-27573: Backport of HIVE-21799: NullPointerException in DynamicPartitionPruningOptimization, when join key is on aggregation column to branch-3 2058c2e0dee is described below commit 2058c2e0dee430f0a8bc9f4a3cadd75d5e087091 Author: Shefali Singh <31477542+shefali...@users.noreply.github.com> AuthorDate: Thu Sep 28 17:13:31 2023 +0530 HIVE-27573: Backport of HIVE-21799: NullPointerException in DynamicPartitionPruningOptimization, when join key is on aggregation column to branch-3 Signed-off-by: Sankar Hariappan <sank...@apache.org> Closes (#4556) --- .../test/resources/testconfiguration.properties | 1 + .../DynamicPartitionPruningOptimization.java | 13 +- .../dynamic_semijoin_reduction_on_aggcol.q | 17 +++ .../dynamic_semijoin_reduction_on_aggcol.q.out | 149 +++++++++++++++++++++ 4 files changed, 171 insertions(+), 9 deletions(-) diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index a5bce33d74f..144a5a8ad48 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -518,6 +518,7 @@ minillaplocal.query.files=\ dynamic_semijoin_reduction_2.q,\ dynamic_semijoin_reduction_3.q,\ dynamic_semijoin_reduction_4.q,\ + dynamic_semijoin_reduction_on_aggcol.q,\ dynamic_semijoin_reduction_sw.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java index a1401aac72c..d84f10b4c38 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java @@ -576,16 +576,11 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor { // Create the column expr map Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); ExprNodeDesc exprNode = null; - if ( parentOfRS.getColumnExprMap() != null) { - exprNode = parentOfRS.getColumnExprMap().get(internalColName).clone(); - } else { - exprNode = new ExprNodeColumnDesc(columnInfo); - } - - if (exprNode instanceof ExprNodeColumnDesc) { - ExprNodeColumnDesc encd = (ExprNodeColumnDesc) exprNode; - encd.setColumn(internalColName); + if (columnInfo == null) { + LOG.debug("No ColumnInfo found in {} for {}", parentOfRS.getOperatorId(), internalColName); + return false; } + exprNode = new ExprNodeColumnDesc(columnInfo); colExprMap.put(internalColName, exprNode); // Create the Select Operator diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q new file mode 100644 index 00000000000..e7c8db3e778 --- /dev/null +++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q @@ -0,0 +1,17 @@ +--! qt:dataset:src +set hive.explain.user=false; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.min.bloom.filter.entries=1; + +create table dynamic_semijoin_reduction_on_aggcol(id int, outcome string, eventid int) stored as orc; +insert into dynamic_semijoin_reduction_on_aggcol select key, value, key from src; + +explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid; + +select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid; diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out new file mode 100644 index 00000000000..4d29456df26 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out @@ -0,0 +1,149 @@ +PREHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int, outcome string, eventid int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol +POSTHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int, outcome string, eventid int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol +PREHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key, value, key from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol +POSTHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key, value, key from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol +POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.eventid EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.id EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.outcome SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid +PREHOOK: type: QUERY +PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol +#### A masked pattern was here #### +POSTHOOK: query: explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Map 4 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: dynamic_semijoin_reduction_on_aggcol + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (id = 0) (type: boolean) + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: eventid (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(_col1) + keys: 0 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: int) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: outcome (type: string), eventid (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 0 (type: int), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid +PREHOOK: type: QUERY +PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol +#### A masked pattern was here #### +POSTHOOK: query: select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a +LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b +on a.event_id_max = b.eventid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol +#### A masked pattern was here #### +0 val_0 +0 val_0 +0 val_0