Repository: hive Updated Branches: refs/heads/master e103abc3f -> 867a187bb
HIVE-21040 : msck does unnecessary file listing at last level of directory tree (Vihang Karajgaonkar, reviewed by Prasanth Jayachandran) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/867a187b Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/867a187b Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/867a187b Branch: refs/heads/master Commit: 867a187bbd0b4d9fe8b567d8eb8e5a8fcd6afa9f Parents: e103abc Author: Vihang Karajgaonkar <vihan...@apache.org> Authored: Mon Dec 17 17:13:56 2018 -0800 Committer: Vihang Karajgaonkar <vihan...@apache.org> Committed: Thu Dec 20 18:47:43 2018 -0800 ---------------------------------------------------------------------- .../ql/metadata/TestHiveMetaStoreChecker.java | 3 + .../hive/metastore/HiveMetaStoreChecker.java | 18 +-- .../hive/metastore/TestMsckCheckPartitions.java | 138 +++++++++++++++++++ 3 files changed, 151 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java index 434d82a..520eb1b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java @@ -714,6 +714,9 @@ public class TestHiveMetaStoreChecker { private void createDirectory(String partPath) throws IOException { Path part = new Path(partPath); fs.mkdirs(part); + // create files under partitions to simulate real partitions + fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata1")); + fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata2")); fs.deleteOnExit(part); } } http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java index 2df45f6..6f4400a 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java @@ -45,6 +45,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ThreadFactory; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -474,10 +475,13 @@ public class HiveMetaStoreChecker { throws IOException, MetastoreException { final Path currentPath = pd.p; final int currentDepth = pd.depth; + if (currentDepth == partColNames.size()) { + return currentPath; + } FileStatus[] fileStatuses = fs.listStatus(currentPath, FileUtils.HIDDEN_FILES_PATH_FILTER); // found no files under a sub-directory under table base path; it is possible that the table // is empty and hence there are no partition sub-directories created under base path - if (fileStatuses.length == 0 && currentDepth > 0 && currentDepth < partColNames.size()) { + if (fileStatuses.length == 0 && currentDepth > 0) { // since maxDepth is not yet reached, we are missing partition // columns in currentPath logOrThrowExceptionWithMsg( @@ -485,12 +489,12 @@ public class HiveMetaStoreChecker { } else { // found files under currentPath add them to the queue if it is a directory for (FileStatus fileStatus : fileStatuses) { - if (!fileStatus.isDirectory() && currentDepth < partColNames.size()) { + if (!fileStatus.isDirectory()) { // found a file at depth which is less than number of partition keys logOrThrowExceptionWithMsg( "MSCK finds a file rather than a directory when it searches for " + fileStatus.getPath().toString()); - } else if (fileStatus.isDirectory() && currentDepth < partColNames.size()) { + } else { // found a sub-directory at a depth less than number of partition keys // validate if the partition directory name matches with the corresponding // partition colName at currentDepth @@ -507,9 +511,6 @@ public class HiveMetaStoreChecker { } } } - if (currentDepth == partColNames.size()) { - return currentPath; - } } return null; } @@ -532,7 +533,8 @@ public class HiveMetaStoreChecker { } } - private void checkPartitionDirs(final ExecutorService executor, + @VisibleForTesting + void checkPartitionDirs(final ExecutorService executor, final Path basePath, final Set<Path> result, final FileSystem fs, final List<String> partColNames) throws MetastoreException { try { @@ -563,7 +565,7 @@ public class HiveMetaStoreChecker { nextLevel = tempQueue; } } catch (InterruptedException | ExecutionException e) { - LOG.error(e.getMessage()); + LOG.error("Exception received while listing partition directories", e); executor.shutdownNow(); throw new MetastoreException(e.getCause()); } http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java new file mode 100644 index 0000000..980423e --- /dev/null +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java @@ -0,0 +1,138 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.MetastoreException; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.utils.FileUtils; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@Category(MetastoreUnitTest.class) +public class TestMsckCheckPartitions { + + /** + * Test counts the number of listStatus calls in the msck core method of + * listing sub-directories. This is important to check since it unnecessary + * listStatus calls could cause performance degradation in remote filesystems + * like S3. The test creates a mock FileSystem object and a mock directory structure + * to simulate a table which has 2 partition keys and 2 partition values at each level. + * In the end it counts how many times the listStatus is called on the mock filesystem + * and confirm its equal to the current theoretical value. + * + * @throws IOException + * @throws MetastoreException + */ + @Test + public void testNumberOfListStatusCalls() throws IOException, MetastoreException { + LocalFileSystem mockFs = Mockito.mock(LocalFileSystem.class); + Path tableLocation = new Path("mock:///tmp/testTable"); + + Path countryUS = new Path(tableLocation, "country=US"); + Path countryIND = new Path(tableLocation, "country=IND"); + + Path cityPA = new Path(countryUS, "city=PA"); + Path citySF = new Path(countryUS, "city=SF"); + Path cityBOM = new Path(countryIND, "city=BOM"); + Path cityDEL = new Path(countryIND, "city=DEL"); + + Path paData = new Path(cityPA, "datafile"); + Path sfData = new Path(citySF, "datafile"); + Path bomData = new Path(cityBOM, "datafile"); + Path delData = new Path(cityDEL, "datafile"); + + //level 1 listing + FileStatus[] allCountries = getMockFileStatus(countryUS, countryIND); + when(mockFs.listStatus(tableLocation, FileUtils.HIDDEN_FILES_PATH_FILTER)) + .thenReturn(allCountries); + + //level 2 listing + FileStatus[] filesInUS = getMockFileStatus(cityPA, citySF); + when(mockFs.listStatus(countryUS, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInUS); + + FileStatus[] filesInInd = getMockFileStatus(cityBOM, cityDEL); + when(mockFs.listStatus(countryIND, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInInd); + + //level 3 listing + FileStatus[] paFiles = getMockFileStatus(paData); + when(mockFs.listStatus(cityPA, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(paFiles); + + FileStatus[] sfFiles = getMockFileStatus(sfData); + when(mockFs.listStatus(citySF, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(sfFiles); + + FileStatus[] bomFiles = getMockFileStatus(bomData); + when(mockFs.listStatus(cityBOM, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(bomFiles); + + FileStatus[] delFiles = getMockFileStatus(delData); + when(mockFs.listStatus(cityDEL, FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(delFiles); + + HiveMetaStoreChecker checker = new HiveMetaStoreChecker(Mockito.mock(IMetaStoreClient.class), + MetastoreConf.newMetastoreConf()); + ExecutorService executorService = Executors.newFixedThreadPool(2); + Set<Path> result = new HashSet<>(); + checker.checkPartitionDirs(executorService, tableLocation, result, mockFs, + Arrays.asList("country", "city")); + // if there are n partition columns, then number of times listStatus should be called + // must be equal + // to (numDirsAtLevel1) + (numDirsAtLevel2) + ... + (numDirAtLeveln-1) + // in this case it should 1 (table level) + 2 (US, IND) + verify(mockFs, times(3)).listStatus(any(Path.class), any(PathFilter.class)); + Assert.assertEquals("msck should have found 4 unknown partitions", 4, result.size()); + } + + private FileStatus[] getMockFileStatus(Path... paths) throws IOException { + FileStatus[] result = new FileStatus[paths.length]; + int i = 0; + for (Path p : paths) { + result[i++] = createMockFileStatus(p); + } + return result; + } + + private FileStatus createMockFileStatus(Path p) { + FileStatus mock = Mockito.mock(FileStatus.class); + when(mock.getPath()).thenReturn(p); + if (p.toString().contains("datafile")) { + when(mock.isDirectory()).thenReturn(false); + } else { + when(mock.isDirectory()).thenReturn(true); + } + return mock; + } +}