HIVE-17275: Auto-merge fails on writes of UNION ALL output to ORC file with 
dynamic partitioning (Chris Drome, reviewed by Mithun Radhakrishnan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/82370d16
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/82370d16
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/82370d16

Branch: refs/heads/hive-14535
Commit: 82370d1670ed51a416ee0c94abae0079f88db8d4
Parents: bc0a2f1
Author: Mithun RK <mit...@apache.org>
Authored: Tue Sep 12 15:22:51 2017 -0700
Committer: Mithun RK <mit...@apache.org>
Committed: Tue Sep 12 15:23:04 2017 -0700

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../hive/ql/exec/AbstractFileMergeOperator.java |  14 +-
 .../hadoop/hive/ql/parse/GenTezUtils.java       |   3 +-
 .../tez_union_dynamic_partition_2.q             |  27 +++
 .../llap/tez_union_dynamic_partition_2.q.out    | 219 +++++++++++++++++++
 5 files changed, 260 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties 
b/itests/src/test/resources/testconfiguration.properties
index d9e760f..d472bb3 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -454,6 +454,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   global_limit.q,\
   dynamic_partition_pruning_2.q,\
   tez_union_dynamic_partition.q,\
+  tez_union_dynamic_partition_2.q,\
   load_fs2.q,\
   llap_stats.q,\
   multi_count_distinct_null.q

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
----------------------------------------------------------------------
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
index 71fb11f..6bba057 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
@@ -43,6 +43,7 @@ public abstract class AbstractFileMergeOperator<T extends 
FileMergeDesc>
     extends Operator<T> implements Serializable {
 
   public static final String BACKUP_PREFIX = "_backup.";
+  public static final String UNION_SUDBIR_PREFIX = "HIVE_UNION_SUBDIR_";
   public static final Logger LOG = 
LoggerFactory.getLogger(AbstractFileMergeOperator.class);
 
   protected JobConf jc;
@@ -193,12 +194,19 @@ public abstract class AbstractFileMergeOperator<T extends 
FileMergeDesc>
       }
     } else {
       if (hasDynamicPartitions || (listBucketingDepth > 0)) {
+        // In light of results from union queries, we need to be aware that
+        // sub-directories can exist in the partition directory. We want to
+        // ignore these sub-directories and promote merged files to the
+        // partition directory.
+        String name = path.getName();
+        Path realPartitionPath = name.startsWith(UNION_SUDBIR_PREFIX) ? 
path.getParent() : path;
+
         if (tmpPathFixed) {
-          checkPartitionsMatch(path);
+          checkPartitionsMatch(realPartitionPath);
         } else {
           // We haven't fixed the TMP path for this mapper yet
-          int depthDiff = path.depth() - tmpPath.depth();
-          fixTmpPath(path, depthDiff);
+          int depthDiff = realPartitionPath.depth() - tmpPath.depth();
+          fixTmpPath(realPartitionPath, depthDiff);
           tmpPathFixed = true;
         }
       }

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java 
b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index 1b0a2f0..40d4fad 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -25,6 +25,7 @@ import java.util.*;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator;
 import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator;
 import org.apache.hadoop.hive.ql.exec.FetchTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
@@ -314,7 +315,7 @@ public class GenTezUtils {
         linked = context.linkedFileSinks.get(path);
         linked.add(desc);
 
-        desc.setDirName(new Path(path, "" + linked.size()));
+        desc.setDirName(new Path(path, 
AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
         desc.setLinkedFileSink(true);
         desc.setParentDir(path);
         desc.setLinkedFileSinkDesc(linked);

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q 
b/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
new file mode 100644
index 0000000..e8cfb3a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
@@ -0,0 +1,27 @@
+drop table if exists dummy;
+drop table if exists partunion1;
+ 
+create table dummy(i int);
+insert into table dummy values (1);
+select * from dummy;
+
+create table partunion1(id1 int) partitioned by (part1 string) stored as orc;
+
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.merge.tezfiles=true;
+
+explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy ) temps;
+
+insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy;
+
+select * from partunion1;
+
+drop table dummy;
+drop table partunion1;

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
----------------------------------------------------------------------
diff --git 
a/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out 
b/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
new file mode 100644
index 0000000..78a17df
--- /dev/null
+++ 
b/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
@@ -0,0 +1,219 @@
+PREHOOK: query: drop table if exists dummy
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists dummy
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists partunion1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists partunion1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table dummy(i int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dummy
+POSTHOOK: query: create table dummy(i int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dummy
+PREHOOK: query: insert into table dummy values (1)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@dummy
+POSTHOOK: query: insert into table dummy values (1)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@dummy
+POSTHOOK: Lineage: dummy.i EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, 
type:string, comment:), ]
+PREHOOK: query: select * from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+#### A masked pattern was here ####
+1
+PREHOOK: query: create table partunion1(id1 int) partitioned by (part1 string) 
stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: create table partunion1(id1 int) partitioned by (part1 
string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@partunion1
+PREHOOK: query: explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy ) temps
+PREHOOK: type: QUERY
+POSTHOOK: query: explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy ) temps
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-8 depends on stages: Stage-1 , consists of Stage-5, Stage-4, Stage-6
+  Stage-5
+  Stage-2 depends on stages: Stage-5, Stage-4, Stage-7
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+  Stage-4
+  Stage-6
+  Stage-7 depends on stages: Stage-6
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Union 2 (CONTAINS)
+        Map 3 <- Union 2 (CONTAINS)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: dummy
+                  Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: 1 (type: int)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE 
Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), '2014' (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 2 Data size: 184 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 2 Data size: 184 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                            name: default.partunion1
+            Execution mode: llap
+            LLAP IO: no inputs
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: dummy
+                  Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: 2 (type: int)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE 
Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), '2014' (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 2 Data size: 184 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 2 Data size: 184 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                            name: default.partunion1
+            Execution mode: llap
+            LLAP IO: no inputs
+        Union 2 
+            Vertex: Union 2
+
+  Stage: Stage-8
+    Conditional Operator
+
+  Stage: Stage-5
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          partition:
+            part1 
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.partunion1
+
+  Stage: Stage-3
+    Stats-Aggr Operator
+
+  Stage: Stage-4
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        File Merge 
+          Merge File Operator
+            Map Operator Tree:
+                ORC File Merge Operator
+            merge level: stripe
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+
+  Stage: Stage-6
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        File Merge 
+          Merge File Operator
+            Map Operator Tree:
+                ORC File Merge Operator
+            merge level: stripe
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+
+  Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy 
+union all 
+select 2 as id1, '2014' as part1 from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+POSTHOOK: Output: default@partunion1@part1=2014
+POSTHOOK: Lineage: partunion1 PARTITION(part1=2014).id1 EXPRESSION []
+PREHOOK: query: select * from partunion1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@partunion1
+PREHOOK: Input: default@partunion1@part1=2014
+#### A masked pattern was here ####
+POSTHOOK: query: select * from partunion1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@partunion1
+POSTHOOK: Input: default@partunion1@part1=2014
+#### A masked pattern was here ####
+1      2014
+2      2014
+PREHOOK: query: drop table dummy
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@dummy
+PREHOOK: Output: default@dummy
+POSTHOOK: query: drop table dummy
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@dummy
+POSTHOOK: Output: default@dummy
+PREHOOK: query: drop table partunion1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@partunion1
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: drop table partunion1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@partunion1
+POSTHOOK: Output: default@partunion1

Reply via email to