This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
commit 4cbd9579c6cb30a3e0dbe6f86464202d6b935b78 Author: Steve Carlin <scar...@cloudera.com> AuthorDate: Sun Oct 8 16:50:03 2023 -0700 HIVE-27777: CBO fails on multi insert overwrites with common group expression (Steve Carlin reviewed by Stamatis Zampetakis) The following statement is failing at compilation time when CBO is enabled. FROM (select key, f1 FROM tbl1 where key=5) a INSERT OVERWRITE TABLE tbl2 partition(key=5) select f1 WHERE key > 0 GROUP by f1 INSERT OVERWRITE TABLE tbl2 partition(key=6) select f1 WHERE key > 0 GROUP by f1; The failure happens when there is a filter to a constant value in the FROM clause, the value is referenced in the filter in the INSERT OVERWRITE, and there is a common group existing across the insert overwrites. CBO is pulling up the key = 5 expression into the select clause as a constant (i.e. select 5 key, f1 FROM tbl1 where key = 5). After it gets converted back into AST and then re-compiled, there is code in the common group method that expects all columns to be non-constants, which is causing the failure. 
Close apache/hive#4783 --- .../hadoop/hive/ql/parse/SemanticAnalyzer.java | 2 +- .../hadoop/hive/ql/plan/ExprNodeConstantDesc.java | 9 + .../queries/clientpositive/multi_insert_gby5.q | 6 + .../clientpositive/llap/multi_insert_gby5.q.out | 207 +++++++++++++++++++++ 4 files changed, 223 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5be52904b5f..250f7c2fcbc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -6097,7 +6097,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { for (Map.Entry<ASTNode, ExprNodeDesc> entry : nodeOutputs.entrySet()) { ASTNode parameter = entry.getKey(); ExprNodeDesc expression = entry.getValue(); - if (!(expression instanceof ExprNodeColumnDesc)) { + if (!(expression instanceof ExprNodeColumnDesc) && !ExprNodeConstantDesc.isFoldedFromCol(expression)) { continue; } if (ExprNodeDescUtils.indexOf(expression, reduceValues) >= 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java index 268aa1a2faa..f5e3828e2cd 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java @@ -22,6 +22,7 @@ import java.io.Serializable; import java.util.List; import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -213,6 +214,14 @@ public class ExprNodeConstantDesc extends ExprNodeDesc implements Serializable { return true; } + public static boolean isFoldedFromCol(ExprNodeDesc expr) { + if 
(!(expr instanceof ExprNodeConstantDesc)) { + return false; + } + ExprNodeConstantDesc constantExpr = (ExprNodeConstantDesc) expr; + return StringUtils.isNotEmpty(constantExpr.foldedFromCol); + } + @Override public int hashCode() { int superHashCode = super.hashCode(); diff --git a/ql/src/test/queries/clientpositive/multi_insert_gby5.q b/ql/src/test/queries/clientpositive/multi_insert_gby5.q new file mode 100644 index 00000000000..be856d7669b --- /dev/null +++ b/ql/src/test/queries/clientpositive/multi_insert_gby5.q @@ -0,0 +1,6 @@ +CREATE TABLE target1 (tc int); +CREATE TABLE target2 (tc int); + +EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa; diff --git a/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out b/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out new file mode 100644 index 00000000000..1345395e0ee --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/multi_insert_gby5.q.out @@ -0,0 +1,207 @@ +PREHOOK: query: CREATE TABLE target1 (tc int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@target1 +POSTHOOK: query: CREATE TABLE target1 (tc int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@target1 +PREHOOK: query: CREATE TABLE target2 (tc int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@target2 +POSTHOOK: query: CREATE TABLE target2 (tc int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@target2 +PREHOOK: query: EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@target1 +PREHOOK: Output: default@target2 +POSTHOOK: query: 
EXPLAIN FROM (SELECT 100 as sa, 200 as sb) source +INSERT OVERWRITE TABLE target1 SELECT sa WHERE sb > 0 GROUP BY sa +INSERT OVERWRITE TABLE target2 SELECT sa GROUP BY sa +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@target1 +POSTHOOK: Output: default@target2 +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + Stage-4 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-3 + Stage-5 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 100 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: 100 (type: int) + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: 200 (type: int) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Forward + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (VALUE._col0 > 0) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE 
+ table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target1 + Select Operator + expressions: _col0 (type: int) + outputColumnNames: tc + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(tc), max(tc), count(1), count(tc), compute_bit_vector_hll(tc) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary) + Group By Operator + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target2 + Select Operator + expressions: _col0 (type: int) + outputColumnNames: tc + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(tc), max(tc), count(1), count(tc), compute_bit_vector_hll(tc) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data 
size: 168 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 1 Data size: 264 
Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-3 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target1 + + Stage: Stage-4 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: tc + Column Types: int + Table: default.target1 + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.target2 + + Stage: Stage-5 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: tc + Column Types: int + Table: default.target2 +