Repository: hive
Updated Branches:
  refs/heads/master e103abc3f -> 867a187bb


HIVE-21040 : msck does unnecessary file listing at last level of directory tree 
(Vihang Karajgaonkar, reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/867a187b
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/867a187b
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/867a187b

Branch: refs/heads/master
Commit: 867a187bbd0b4d9fe8b567d8eb8e5a8fcd6afa9f
Parents: e103abc
Author: Vihang Karajgaonkar <vihan...@apache.org>
Authored: Mon Dec 17 17:13:56 2018 -0800
Committer: Vihang Karajgaonkar <vihan...@apache.org>
Committed: Thu Dec 20 18:47:43 2018 -0800

----------------------------------------------------------------------
 .../ql/metadata/TestHiveMetaStoreChecker.java   |   3 +
 .../hive/metastore/HiveMetaStoreChecker.java    |  18 +--
 .../hive/metastore/TestMsckCheckPartitions.java | 138 +++++++++++++++++++
 3 files changed, 151 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java 
b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
index 434d82a..520eb1b 100644
--- 
a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
+++ 
b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveMetaStoreChecker.java
@@ -714,6 +714,9 @@ public class TestHiveMetaStoreChecker {
   private void createDirectory(String partPath) throws IOException {
     Path part = new Path(partPath);
     fs.mkdirs(part);
+    // create files under partitions to simulate real partitions
+    fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata1"));
+    fs.createNewFile(new Path(partPath + Path.SEPARATOR + "dummydata2"));
     fs.deleteOnExit(part);
   }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
----------------------------------------------------------------------
diff --git 
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
 
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
index 2df45f6..6f4400a 100644
--- 
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
+++ 
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStoreChecker.java
@@ -45,6 +45,7 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.ThreadFactory;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -474,10 +475,13 @@ public class HiveMetaStoreChecker {
         throws IOException, MetastoreException {
       final Path currentPath = pd.p;
       final int currentDepth = pd.depth;
+      if (currentDepth == partColNames.size()) {
+        return currentPath;
+      }
       FileStatus[] fileStatuses = fs.listStatus(currentPath, 
FileUtils.HIDDEN_FILES_PATH_FILTER);
       // found no files under a sub-directory under table base path; it is 
possible that the table
       // is empty and hence there are no partition sub-directories created 
under base path
-      if (fileStatuses.length == 0 && currentDepth > 0 && currentDepth < 
partColNames.size()) {
+      if (fileStatuses.length == 0 && currentDepth > 0) {
         // since maxDepth is not yet reached, we are missing partition
         // columns in currentPath
         logOrThrowExceptionWithMsg(
@@ -485,12 +489,12 @@ public class HiveMetaStoreChecker {
       } else {
         // found files under currentPath add them to the queue if it is a 
directory
         for (FileStatus fileStatus : fileStatuses) {
-          if (!fileStatus.isDirectory() && currentDepth < partColNames.size()) 
{
+          if (!fileStatus.isDirectory()) {
             // found a file at depth which is less than number of partition 
keys
             logOrThrowExceptionWithMsg(
                 "MSCK finds a file rather than a directory when it searches 
for "
                     + fileStatus.getPath().toString());
-          } else if (fileStatus.isDirectory() && currentDepth < 
partColNames.size()) {
+          } else {
             // found a sub-directory at a depth less than number of partition 
keys
             // validate if the partition directory name matches with the 
corresponding
             // partition colName at currentDepth
@@ -507,9 +511,6 @@ public class HiveMetaStoreChecker {
             }
           }
         }
-        if (currentDepth == partColNames.size()) {
-          return currentPath;
-        }
       }
       return null;
     }
@@ -532,7 +533,8 @@ public class HiveMetaStoreChecker {
     }
   }
 
-  private void checkPartitionDirs(final ExecutorService executor,
+  @VisibleForTesting
+  void checkPartitionDirs(final ExecutorService executor,
       final Path basePath, final Set<Path> result,
       final FileSystem fs, final List<String> partColNames) throws 
MetastoreException {
     try {
@@ -563,7 +565,7 @@ public class HiveMetaStoreChecker {
         nextLevel = tempQueue;
       }
     } catch (InterruptedException | ExecutionException e) {
-      LOG.error(e.getMessage());
+      LOG.error("Exception received while listing partition directories", e);
       executor.shutdownNow();
       throw new MetastoreException(e.getCause());
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/867a187b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java
----------------------------------------------------------------------
diff --git 
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java
 
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java
new file mode 100644
index 0000000..980423e
--- /dev/null
+++ 
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestMsckCheckPartitions.java
@@ -0,0 +1,138 @@
+/*
+ *
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.hive.metastore;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.MetastoreException;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.apache.hadoop.hive.metastore.utils.FileUtils;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+@Category(MetastoreUnitTest.class)
+public class TestMsckCheckPartitions {
+
+  /**
+   * Test counts the number of listStatus calls in the msck core method of
+   * listing sub-directories. This is important to check since it unnecessary
+   * listStatus calls could cause performance degradation in remote filesystems
+   * like S3. The test creates a mock FileSystem object and a mock directory 
structure
+   * to simulate a table which has 2 partition keys and 2 partition values at 
each level.
+   * In the end it counts how many times the listStatus is called on the mock 
filesystem
+   * and confirm its equal to the current theoretical value.
+   *
+   * @throws IOException
+   * @throws MetastoreException
+   */
+  @Test
+  public void testNumberOfListStatusCalls() throws IOException, 
MetastoreException {
+    LocalFileSystem mockFs = Mockito.mock(LocalFileSystem.class);
+    Path tableLocation = new Path("mock:///tmp/testTable");
+
+    Path countryUS = new Path(tableLocation, "country=US");
+    Path countryIND = new Path(tableLocation, "country=IND");
+
+    Path cityPA = new Path(countryUS, "city=PA");
+    Path citySF = new Path(countryUS, "city=SF");
+    Path cityBOM = new Path(countryIND, "city=BOM");
+    Path cityDEL = new Path(countryIND, "city=DEL");
+
+    Path paData = new Path(cityPA, "datafile");
+    Path sfData = new Path(citySF, "datafile");
+    Path bomData = new Path(cityBOM, "datafile");
+    Path delData = new Path(cityDEL, "datafile");
+
+    //level 1 listing
+    FileStatus[] allCountries = getMockFileStatus(countryUS, countryIND);
+    when(mockFs.listStatus(tableLocation, FileUtils.HIDDEN_FILES_PATH_FILTER))
+        .thenReturn(allCountries);
+
+    //level 2 listing
+    FileStatus[] filesInUS = getMockFileStatus(cityPA, citySF);
+    when(mockFs.listStatus(countryUS, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInUS);
+
+    FileStatus[] filesInInd = getMockFileStatus(cityBOM, cityDEL);
+    when(mockFs.listStatus(countryIND, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(filesInInd);
+
+    //level 3 listing
+    FileStatus[] paFiles = getMockFileStatus(paData);
+    when(mockFs.listStatus(cityPA, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(paFiles);
+
+    FileStatus[] sfFiles = getMockFileStatus(sfData);
+    when(mockFs.listStatus(citySF, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(sfFiles);
+
+    FileStatus[] bomFiles = getMockFileStatus(bomData);
+    when(mockFs.listStatus(cityBOM, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(bomFiles);
+
+    FileStatus[] delFiles = getMockFileStatus(delData);
+    when(mockFs.listStatus(cityDEL, 
FileUtils.HIDDEN_FILES_PATH_FILTER)).thenReturn(delFiles);
+
+    HiveMetaStoreChecker checker = new 
HiveMetaStoreChecker(Mockito.mock(IMetaStoreClient.class),
+        MetastoreConf.newMetastoreConf());
+    ExecutorService executorService = Executors.newFixedThreadPool(2);
+    Set<Path> result = new HashSet<>();
+    checker.checkPartitionDirs(executorService, tableLocation, result, mockFs,
+        Arrays.asList("country", "city"));
+    // if there are n partition columns, then number of times listStatus 
should be called
+    // must be equal
+    // to (numDirsAtLevel1) + (numDirsAtLevel2) + ... + (numDirAtLeveln-1)
+    // in this case it should 1 (table level) + 2 (US, IND)
+    verify(mockFs, times(3)).listStatus(any(Path.class), 
any(PathFilter.class));
+    Assert.assertEquals("msck should have found 4 unknown partitions", 4, 
result.size());
+  }
+
+  private FileStatus[] getMockFileStatus(Path... paths) throws IOException {
+    FileStatus[] result = new FileStatus[paths.length];
+    int i = 0;
+    for (Path p : paths) {
+      result[i++] = createMockFileStatus(p);
+    }
+    return result;
+  }
+
+  private FileStatus createMockFileStatus(Path p) {
+    FileStatus mock = Mockito.mock(FileStatus.class);
+    when(mock.getPath()).thenReturn(p);
+    if (p.toString().contains("datafile")) {
+      when(mock.isDirectory()).thenReturn(false);
+    } else {
+      when(mock.isDirectory()).thenReturn(true);
+    }
+    return mock;
+  }
+}

Reply via email to