This is an automated email from the ASF dual-hosted git repository. abstractdog pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 7fa1f54 HIVE-23712: metadata-only queries return incorrect results with empty acid partition (#1182) (Laszlo Bodor reviewed by Mustafa Iman, Ashutosh Chauhan) 7fa1f54 is described below commit 7fa1f546bc90f196a3b2a6c1d9897a0a0acae686 Author: Bodor Laszlo <bodorlaszlo0...@gmail.com> AuthorDate: Wed Oct 14 10:03:51 2020 +0200 HIVE-23712: metadata-only queries return incorrect results with empty acid partition (#1182) (Laszlo Bodor reviewed by Mustafa Iman, Ashutosh Chauhan) Change-Id: Ia63c4a502f77f863f1ee20e0a407893689d79afb --- .../org/apache/hadoop/hive/ql/exec/Utilities.java | 19 +-- .../optimizer/physical/NullScanTaskDispatcher.java | 32 ++++- .../metadataonly_acid_empty_partition.q | 22 ++++ .../llap/metadataonly_acid_empty_partition.q.out | 136 +++++++++++++++++++++ 4 files changed, 198 insertions(+), 11 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index b2dde64..9b0f683 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2780,16 +2780,21 @@ public final class Utilities { } public static boolean isEmptyPath(Configuration job, Path dirPath) throws IOException { + FileStatus[] fStats = listNonHiddenFileStatus(job, dirPath); + if (fStats.length > 0) { + return false; + } + return true; + } + + public static FileStatus[] listNonHiddenFileStatus(Configuration job, Path dirPath) + throws IOException { FileSystem inpFs = dirPath.getFileSystem(job); try { - FileStatus[] fStats = inpFs.listStatus(dirPath, FileUtils.HIDDEN_FILES_PATH_FILTER); - if (fStats.length > 0) { - return false; - } - } catch(FileNotFoundException fnf) { - return true; + return inpFs.listStatus(dirPath, FileUtils.HIDDEN_FILES_PATH_FILTER); + } catch (FileNotFoundException e) { + return new FileStatus[] {}; } - return true; } public static List<TezTask> getTezTasks(List<Task<?>> tasks) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java index 9b5a099..b46459c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java @@ -33,12 +33,14 @@ import java.util.Stack; import java.util.stream.Collectors; import org.apache.commons.collections.CollectionUtils; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StringInternUtils; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; import org.apache.hadoop.hive.ql.io.NullScanFileSystem; import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; @@ -96,20 +98,42 @@ public class NullScanTaskDispatcher implements SemanticDispatcher { if (desc == null) { return null; } - boolean isEmpty = false; + FileStatus[] filesFoundInPartitionDir = null; try { - isEmpty = Utilities.isEmptyPath(physicalContext.getConf(), path); + filesFoundInPartitionDir = Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path); } catch (IOException e) { LOG.error("Cannot determine if the table is empty", e); } - desc.setInputFileFormatClass( - isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); + if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) { + return desc; + } + + boolean isEmpty = filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0; + desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class); desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, NullStructSerDe.class.getName()); return desc; } + private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir) { + if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0) { + return true; // empty folders are safe to convert to metadata-only + } + for (FileStatus f : filesFoundInPartitionDir) { + if (AcidUtils.isDeleteDelta(f.getPath())) { + /* + * as described in HIVE-23712, an acid partition is not a safe subject of metadata-only + * optimization, because there is a chance that it contains no data but contains folders + * (e.g: delta_0000002_0000002_0000, delete_delta_0000003_0000003_0000), without scanning + * the underlying file contents, we cannot tell whether this partition contains data or not + */ + return false; + } + } + return true; + } + private void processAlias(MapWork work, Path path, Collection<String> aliasesAffected, Set<String> aliases) { // the aliases that are allowed to map to a null scan. diff --git a/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q b/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q new file mode 100644 index 0000000..f97814a --- /dev/null +++ b/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q @@ -0,0 +1,22 @@ +set hive.support.concurrency=true; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +set hive.optimize.metadataonly=true; + +create table test1 (id int, val string) partitioned by (val2 string) STORED AS ORC TBLPROPERTIES ('transactional'='true'); +describe formatted test1; + +alter table test1 add partition (val2='foo'); +alter table test1 add partition (val2='bar'); +insert into test1 partition (val2='foo') values (1, 'abc'); +insert into test1 partition (val2='bar') values (1, 'def'); +delete from test1 where val2 = 'bar'; + +select '--> hive.optimize.metadataonly=true'; +select distinct val2 from test1; + + +set hive.optimize.metadataonly=false; +select '--> hive.optimize.metadataonly=false'; +select distinct val2 from test1; diff --git a/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out b/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out new file mode 100644 index 0000000..4c2c9a5 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out @@ -0,0 +1,136 @@ +PREHOOK: query: create table test1 (id int, val string) partitioned by (val2 string) STORED AS ORC TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test1 +POSTHOOK: query: create table test1 (id int, val string) partitioned by (val2 string) STORED AS ORC TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test1 +PREHOOK: query: describe formatted test1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test1 +POSTHOOK: query: describe formatted test1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test1 +# col_name data_type comment +id int +val string + +# Partition Information +# col_name data_type comment +val2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + bucketing_version 2 + numFiles 0 + numPartitions 0 + numRows 0 + rawDataSize 0 + totalSize 0 + transactional true + transactional_properties default +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +PREHOOK: query: alter table test1 add partition (val2='foo') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@test1 +POSTHOOK: query: alter table test1 add partition (val2='foo') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@test1 +POSTHOOK: Output: default@test1@val2=foo +PREHOOK: query: alter table test1 add partition (val2='bar') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@test1 +POSTHOOK: query: alter table test1 add partition (val2='bar') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@test1 +POSTHOOK: Output: default@test1@val2=bar +PREHOOK: query: insert into test1 partition (val2='foo') values (1, 'abc') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test1@val2=foo +POSTHOOK: query: insert into test1 partition (val2='foo') values (1, 'abc') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test1@val2=foo +POSTHOOK: Lineage: test1 PARTITION(val2=foo).id SCRIPT [] +POSTHOOK: Lineage: test1 PARTITION(val2=foo).val SCRIPT [] +PREHOOK: query: insert into test1 partition (val2='bar') values (1, 'def') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test1@val2=bar +POSTHOOK: query: insert into test1 partition (val2='bar') values (1, 'def') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test1@val2=bar +POSTHOOK: Lineage: test1 PARTITION(val2=bar).id SCRIPT [] +POSTHOOK: Lineage: test1 PARTITION(val2=bar).val SCRIPT [] +PREHOOK: query: delete from test1 where val2 = 'bar' +PREHOOK: type: QUERY +PREHOOK: Input: default@test1 +PREHOOK: Input: default@test1@val2=bar +PREHOOK: Output: default@test1@val2=bar +POSTHOOK: query: delete from test1 where val2 = 'bar' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1 +POSTHOOK: Input: default@test1@val2=bar +POSTHOOK: Output: default@test1@val2=bar +PREHOOK: query: select '--> hive.optimize.metadataonly=true' +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select '--> hive.optimize.metadataonly=true' +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +--> hive.optimize.metadataonly=true +PREHOOK: query: select distinct val2 from test1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test1 +PREHOOK: Input: default@test1@val2=bar +PREHOOK: Input: default@test1@val2=foo +#### A masked pattern was here #### +POSTHOOK: query: select distinct val2 from test1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1 +POSTHOOK: Input: default@test1@val2=bar +POSTHOOK: Input: default@test1@val2=foo +#### A masked pattern was here #### +foo +PREHOOK: query: select '--> hive.optimize.metadataonly=false' +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select '--> hive.optimize.metadataonly=false' +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +--> hive.optimize.metadataonly=false +PREHOOK: query: select distinct val2 from test1 +PREHOOK: type: QUERY +PREHOOK: Input: default@test1 +PREHOOK: Input: default@test1@val2=bar +PREHOOK: Input: default@test1@val2=foo +#### A masked pattern was here #### +POSTHOOK: query: select distinct val2 from test1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test1 +POSTHOOK: Input: default@test1@val2=bar +POSTHOOK: Input: default@test1@val2=foo +#### A masked pattern was here #### +foo