This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
commit 4cbd9579c6cb30a3e0dbe6f86464202d6b935b78 Author: Steve Carlin <scar...@cloudera.com> AuthorDate: Sun Oct 8 16:50:03 2023 -0700 HIVE-27777: CBO fails on multi insert overwrites with common group expression (Steve Carlin reviewed by Stamatis Zampetakis) The following statement is failing at compilation time when CBO is enabled. FROM (select key, f1 FROM tbl1 where key=5) a INSERT OVERWRITE TABLE tbl2 partition(key=5) select f1 WHERE key > 0 GROUP by f1 INSERT OVERWRITE TABLE tbl2 partition(key=6) select f1 WHERE key > 0 GROUP by f1; The failure happens when there is a filter to a constant value in the FROM clause, the value is referenced in the filter in the INSERT OVERWRITE, and there is a common group existing across the insert overwrites. CBO is pulling up the key = 5 expression into the select clause as a constant (i.e. select 5 key, f1 FROM tbl1 where key = 5). After it gets converted back into AST and then re-compiled, there is code in the common group method that expects all columns to be non-constants, which is causing the failure. 
Close apache/hive#4783 --- .../hadoop/hive/ql/parse/SemanticAnalyzer.java | 2 +- .../hadoop/hive/ql/plan/ExprNodeConstantDesc.java | 9 + .../queries/clientpositive/multi_insert_gby5.q | 6 + .../clientpositive/llap/multi_insert_gby5.q.out | 207 +++++++++++++++++++++ 4 files changed, 223 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5be52904b5f..250f7c2fcbc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -6097,7 +6097,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { for (Map.Entry<ASTNode, ExprNodeDesc> entry : nodeOutputs.entrySet()) { ASTNode parameter = entry.getKey(); ExprNodeDesc expression = entry.getValue(); - if (!(expression instanceof ExprNodeColumnDesc)) { + if (!(expression instanceof ExprNodeColumnDesc) && !ExprNodeConstantDesc.isFoldedFromCol(expression)) { continue; } if (ExprNodeDescUtils.indexOf(expression, reduceValues) >= 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java index 268aa1a2faa..f5e3828e2cd 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java @@ -22,6 +22,7 @@ import java.io.Serializable; import java.util.List; import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -213,6 +214,14 @@ public class ExprNodeConstantDesc extends ExprNodeDesc implements Serializable { return true; } + public static boolean isFoldedFromCol(ExprNodeDesc expr) { + if 
(!(expr instanceof ExprNodeConstantDesc)) { + return false; + } + ExprNodeConstantDesc constantExpr = (ExprNodeConstantDesc) expr; + return StringUtils.isNotEmpty(constantExpr.foldedFromCol); + } + @Override public int hashCode() { int superHashCode = super.hashCode(); diff --git a/ql/src/test/queries/clientpositive/multi_insert_gby5.q b/ql/src/test/queries/clientpositive/multi_insert_gby5.q new file mode 100644 index 00000000000..be856d7669b --- /dev/null +++ b/ql/src/test/queries/clientpositive/multi_insert_gby5.q @@ -0,0 +1,6 @@ +CREATE TABLE target1 (tc int); +CREATE TABLE target2 (tc int); + +EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa; diff --git a/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out b/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out new file mode 100644 index 00000000000..1345395e0ee --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out @@ -0,0 +1,207 @@ +PREHOOK: query: CREATE TABLE target1 (tc int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@target1 +POSTHOOK: query: CREATE TABLE target1 (tc int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@target1 +PREHOOK: query: CREATE TABLE target2 (tc int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@target2 +POSTHOOK: query: CREATE TABLE target2 (tc int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@target2 +PREHOOK: query: EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@target1 +PREHOOK: Output: default@target2 +POSTHOOK: query: 
EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@target1 +POSTHOOK: Output: default@target2 +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + Stage-4 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-3 + Stage-5 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 100 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: 100 (type: int) + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: 200 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Forward + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (VALUE._col0 > 0) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE 
+ table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target1 + Select Operator + expressions: _col0 (type: int) + outputColumnNames: tc + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(tc), max(tc), count(1), count(tc), compute_bit_vector_hll(tc) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary) + Group By Operator + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target2 + Select Operator + expressions: _col0 (type: int) + outputColumnNames: tc + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(tc), max(tc), count(1), count(tc), compute_bit_vector_hll(tc) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data 
size: 168 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 1 Data size: 264 
Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-3 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target1 + + Stage: Stage-4 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: tc + Column Types: int + Table: default.target1 + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target2 + + Stage: Stage-5 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: tc + Column Types: int + Table: default.target2 +