HIVE-17275: Auto-merge fails on writes of UNION ALL output to ORC file with dynamic partitioning (Chris Drome, reviewed by Mithun Radhakrishnan)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/82370d16
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/82370d16
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/82370d16

Branch: refs/heads/hive-14535
Commit: 82370d1670ed51a416ee0c94abae0079f88db8d4
Parents: bc0a2f1
Author: Mithun RK <mit...@apache.org>
Authored: Tue Sep 12 15:22:51 2017 -0700
Committer: Mithun RK <mit...@apache.org>
Committed: Tue Sep 12 15:23:04 2017 -0700

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../hive/ql/exec/AbstractFileMergeOperator.java |  14 +-
 .../hadoop/hive/ql/parse/GenTezUtils.java       |   3 +-
 .../tez_union_dynamic_partition_2.q             |  27 +++
 .../llap/tez_union_dynamic_partition_2.q.out    | 219 +++++++++++++++++++
 5 files changed, 260 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index d9e760f..d472bb3 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -454,6 +454,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   global_limit.q,\
   dynamic_partition_pruning_2.q,\
   tez_union_dynamic_partition.q,\
+  tez_union_dynamic_partition_2.q,\
   load_fs2.q,\
   llap_stats.q,\
   multi_count_distinct_null.q

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
index 71fb11f..6bba057 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractFileMergeOperator.java
@@ -43,6 +43,7 @@ public abstract class AbstractFileMergeOperator<T extends FileMergeDesc>
     extends Operator<T> implements Serializable {
 
   public static final String BACKUP_PREFIX = "_backup.";
+  public static final String UNION_SUDBIR_PREFIX = "HIVE_UNION_SUBDIR_";
   public static final Logger LOG = LoggerFactory.getLogger(AbstractFileMergeOperator.class);
 
   protected JobConf jc;
@@ -193,12 +194,19 @@ public abstract class AbstractFileMergeOperator<T extends FileMergeDesc>
         }
       } else {
         if (hasDynamicPartitions || (listBucketingDepth > 0)) {
+          // In light of results from union queries, we need to be aware that
+          // sub-directories can exist in the partition directory. We want to
+          // ignore these sub-directories and promote merged files to the
+          // partition directory.
+          String name = path.getName();
+          Path realPartitionPath = name.startsWith(UNION_SUDBIR_PREFIX) ? path.getParent() : path;
+
           if (tmpPathFixed) {
-            checkPartitionsMatch(path);
+            checkPartitionsMatch(realPartitionPath);
           } else {
             // We haven't fixed the TMP path for this mapper yet
-            int depthDiff = path.depth() - tmpPath.depth();
-            fixTmpPath(path, depthDiff);
+            int depthDiff = realPartitionPath.depth() - tmpPath.depth();
+            fixTmpPath(realPartitionPath, depthDiff);
             tmpPathFixed = true;
           }
         }
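
For illustration, a minimal, self-contained sketch of the path promotion the
hunk above performs. The warehouse paths are hypothetical; only the
UNION_SUDBIR_PREFIX constant and the startsWith()/getParent() check mirror
the patch:

    import org.apache.hadoop.fs.Path;

    public class UnionSubdirPromotion {
      static final String UNION_SUDBIR_PREFIX = "HIVE_UNION_SUBDIR_";

      // If the merge input sits in a union subdirectory, the real partition
      // directory is that subdirectory's parent; otherwise the path already
      // is the partition directory.
      static Path realPartitionPath(Path path) {
        return path.getName().startsWith(UNION_SUDBIR_PREFIX)
            ? path.getParent() : path;
      }

      public static void main(String[] args) {
        Path plain = new Path("/warehouse/partunion1/part1=2014");
        Path union = new Path("/warehouse/partunion1/part1=2014/HIVE_UNION_SUBDIR_1");
        System.out.println(realPartitionPath(plain));  // /warehouse/partunion1/part1=2014
        System.out.println(realPartitionPath(union));  // /warehouse/partunion1/part1=2014
      }
    }

This way checkPartitionsMatch() and fixTmpPath() operate on the partition
directory itself rather than on a per-branch union subdirectory.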

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index 1b0a2f0..40d4fad 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -25,6 +25,7 @@ import java.util.*;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator;
 import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator;
 import org.apache.hadoop.hive.ql.exec.FetchTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
@@ -314,7 +315,7 @@ public class GenTezUtils {
       linked = context.linkedFileSinks.get(path);
       linked.add(desc);
 
-      desc.setDirName(new Path(path, "" + linked.size()));
+      desc.setDirName(new Path(path, AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
       desc.setLinkedFileSink(true);
       desc.setParentDir(path);
       desc.setLinkedFileSinkDesc(linked);
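
Before this hunk, each linked file sink for a UNION ALL branch wrote to a
bare numeric subdirectory ("1", "2", ...), which AbstractFileMergeOperator
had no way to recognize as union output. A small sketch of the resulting
names, assuming a hypothetical staging directory:

    import org.apache.hadoop.fs.Path;

    public class LinkedSinkNaming {
      public static void main(String[] args) {
        // Hypothetical staging directory shared by the linked file sinks.
        Path stagingDir = new Path("/tmp/hive-staging/-ext-10000");
        for (int branch = 1; branch <= 2; branch++) {
          Path before = new Path(stagingDir, "" + branch);
          Path after = new Path(stagingDir, "HIVE_UNION_SUBDIR_" + branch);
          System.out.println(before + "  ->  " + after);
        }
        // Prints:
        //   /tmp/hive-staging/-ext-10000/1  ->  /tmp/hive-staging/-ext-10000/HIVE_UNION_SUBDIR_1
        //   /tmp/hive-staging/-ext-10000/2  ->  /tmp/hive-staging/-ext-10000/HIVE_UNION_SUBDIR_2
      }
    }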

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q b/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
new file mode 100644
index 0000000..e8cfb3a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tez_union_dynamic_partition_2.q
@@ -0,0 +1,27 @@
+drop table if exists dummy;
+drop table if exists partunion1;
+
+create table dummy(i int);
+insert into table dummy values (1);
+select * from dummy;
+
+create table partunion1(id1 int) partitioned by (part1 string) stored as orc;
+
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.merge.tezfiles=true;
+
+explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy ) temps;
+
+insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy;
+
+select * from partunion1;
+
+drop table dummy;
+drop table partunion1;
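
The test above sets hive.merge.tezfiles=true so that the small files written
by the two union branches are merged after the insert. As a rough manual
check of the fix, one could list the target partition afterwards: merged
files should sit directly under part1=2014, with no leftover
HIVE_UNION_SUBDIR_* directories. A sketch, assuming a default warehouse
location:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class VerifyMergedPartition {
      public static void main(String[] args) throws Exception {
        // Hypothetical warehouse location for the test table's partition.
        Path partition = new Path("/user/hive/warehouse/partunion1/part1=2014");
        FileSystem fs = partition.getFileSystem(new Configuration());
        for (FileStatus status : fs.listStatus(partition)) {
          // Expect only data files here (e.g. 000000_0); a directory named
          // HIVE_UNION_SUBDIR_<n> would mean the merge output was not
          // promoted into the partition directory.
          System.out.println((status.isDirectory() ? "dir  " : "file ")
              + status.getPath().getName());
        }
      }
    }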

http://git-wip-us.apache.org/repos/asf/hive/blob/82370d16/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out b/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
new file mode 100644
index 0000000..78a17df
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition_2.q.out
@@ -0,0 +1,219 @@
+PREHOOK: query: drop table if exists dummy
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists dummy
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists partunion1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists partunion1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table dummy(i int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dummy
+POSTHOOK: query: create table dummy(i int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dummy
+PREHOOK: query: insert into table dummy values (1)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@dummy
+POSTHOOK: query: insert into table dummy values (1)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@dummy
+POSTHOOK: Lineage: dummy.i EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: select * from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+#### A masked pattern was here ####
+POSTHOOK: query: select * from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+#### A masked pattern was here ####
+1
+PREHOOK: query: create table partunion1(id1 int) partitioned by (part1 string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: create table partunion1(id1 int) partitioned by (part1 string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@partunion1
+PREHOOK: query: explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy ) temps
+PREHOOK: type: QUERY
+POSTHOOK: query: explain insert into table partunion1 partition(part1)
+select temps.* from (
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy ) temps
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-8 depends on stages: Stage-1 , consists of Stage-5, Stage-4, Stage-6
+  Stage-5
+  Stage-2 depends on stages: Stage-5, Stage-4, Stage-7
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+  Stage-4
+  Stage-6
+  Stage-7 depends on stages: Stage-6
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Union 2 (CONTAINS)
+        Map 3 <- Union 2 (CONTAINS)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: dummy
+                  Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: 1 (type: int)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), '2014' (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                            name: default.partunion1
+            Execution mode: llap
+            LLAP IO: no inputs
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: dummy
+                  Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: 2 (type: int)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), '2014' (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                            name: default.partunion1
+            Execution mode: llap
+            LLAP IO: no inputs
+        Union 2 
+            Vertex: Union 2
+
+  Stage: Stage-8
+    Conditional Operator
+
+  Stage: Stage-5
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          partition:
+            part1 
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.partunion1
+
+  Stage: Stage-3
+    Stats-Aggr Operator
+
+  Stage: Stage-4
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        File Merge 
+            Merge File Operator
+              Map Operator Tree:
+                  ORC File Merge Operator
+              merge level: stripe
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+
+  Stage: Stage-6
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        File Merge 
+            Merge File Operator
+              Map Operator Tree:
+                  ORC File Merge Operator
+              merge level: stripe
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+
+  Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dummy
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: insert into table partunion1 partition(part1)
+select 1 as id1, '2014' as part1 from dummy
+union all
+select 2 as id1, '2014' as part1 from dummy
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dummy
+POSTHOOK: Output: default@partunion1@part1=2014
+POSTHOOK: Lineage: partunion1 PARTITION(part1=2014).id1 EXPRESSION []
+PREHOOK: query: select * from partunion1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@partunion1
+PREHOOK: Input: default@partunion1@part1=2014
+#### A masked pattern was here ####
+POSTHOOK: query: select * from partunion1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@partunion1
+POSTHOOK: Input: default@partunion1@part1=2014
+#### A masked pattern was here ####
+1	2014
+2	2014
+PREHOOK: query: drop table dummy
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@dummy
+PREHOOK: Output: default@dummy
+POSTHOOK: query: drop table dummy
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@dummy
+POSTHOOK: Output: default@dummy
+PREHOOK: query: drop table partunion1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@partunion1
+PREHOOK: Output: default@partunion1
+POSTHOOK: query: drop table partunion1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@partunion1
+POSTHOOK: Output: default@partunion1