This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ec3f416 [ISSUE-392] Fix the bug in the shuffle data cleanup checker 
that causes false reports of disk corruption (#393)
3ec3f416 is described below

commit 3ec3f416596504d78284985a6b6fd64f1ba0d26d
Author: Junfan Zhang <[email protected]>
AuthorDate: Thu Dec 8 19:19:06 2022 +0800

    [ISSUE-392] Fix the bug in the shuffle data cleanup checker that causes 
false reports of disk corruption (#393)
    
    ### What changes were proposed in this pull request?
    
    [ISSUE-392] Fix the bug in the shuffle data cleanup checker that causes 
false reports of disk corruption #393
    
    
    ### Why are the changes needed?
    
    Fix the bug in the shuffle data checker that causes false reports of disk 
corruption during cleanup.
    
    ```
    [INFO] 2022-12-07 16:27:51,411 leakShuffleDataChecker ShuffleTaskManager 
checkLeakShuffleData - Start check leak shuffle data
    [INFO] 2022-12-07 16:27:51,416 leakShuffleDataChecker 
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with 
/data1/uniffle/data/check cost 0 ms
    [INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker 
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with 
/data2/uniffle/data/check cost 0 ms
    [INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker 
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with 
/data3/uniffle/data/check cost 0 ms
    [INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker 
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with 
/data4/uniffle/data/check cost 0 ms
    [INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker ShuffleTaskManager 
checkLeakShuffleData - Finish check leak shuffle data
    [ERROR] 2022-12-07 16:27:51,685 HealthCheckService LocalStorageChecker 
checkStorageReadAndWrite - Storage read and write error
    java.io.FileNotFoundException: /data4/uniffle/data/check/test (No such file 
or directory)
            at java.io.FileInputStream.open0(Native Method)
            at java.io.FileInputStream.open(FileInputStream.java:195)
            at java.io.FileInputStream.<init>(FileInputStream.java:138)
            at 
org.apache.uniffle.server.LocalStorageChecker$StorageInfo.checkStorageReadAndWrite(LocalStorageChecker.java:180)
            at 
org.apache.uniffle.server.LocalStorageChecker.checkIsHealthy(LocalStorageChecker.java:73)
            at org.apache.uniffle.server.HealthCheck.check(HealthCheck.java:84)
            at 
org.apache.uniffle.server.HealthCheck.lambda$new$0(HealthCheck.java:70)
            at java.lang.Thread.run(Thread.java:745)
    [INFO] 2022-12-07 16:27:51,685 HealthCheckService LocalStorageChecker 
checkIsHealthy - shuffle server become unhealthy
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    1. UTs
---
 .../main/java/org/apache/uniffle/server/LocalStorageChecker.java  | 8 +++++---
 .../java/org/apache/uniffle/server/ShuffleTaskManagerTest.java    | 7 +++++++
 .../main/java/org/apache/uniffle/storage/common/LocalStorage.java | 2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git 
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java 
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index fc9475dd..89019393 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -37,6 +37,7 @@ import org.apache.uniffle.storage.util.ShuffleStorageUtils;
 public class LocalStorageChecker extends Checker {
 
   private static final Logger LOG = 
LoggerFactory.getLogger(LocalStorageChecker.class);
+  public static final String CHECKER_DIR_NAME = ".check";
 
   private final double diskMaxUsagePercentage;
   private final double diskRecoveryUsagePercentage;
@@ -164,7 +165,8 @@ public class LocalStorageChecker extends Checker {
       if (storage.isCorrupted()) {
         return false;
       }
-      File checkDir = new File(storageDir, "check");
+      // Use the hidden file to avoid being cleanup
+      File checkDir = new File(storageDir, CHECKER_DIR_NAME);
       try {
         if (!checkDir.mkdirs()) {
           return false;
@@ -196,13 +198,13 @@ public class LocalStorageChecker extends Checker {
           } while (readBytes != -1);
         }
       } catch (Exception e) {
-        LOG.error("Storage read and write error ", e);
+        LOG.error("Storage read and write error. Storage dir: {}", storageDir, 
e);
         return false;
       } finally {
         try {
           FileUtils.deleteDirectory(checkDir);
         } catch (IOException ioe) {
-          LOG.error("delete directory fail", ioe);
+          LOG.error("delete directory fail. Storage dir: {}", storageDir, ioe);
           return false;
         }
       }
diff --git 
a/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java 
b/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
index 03d1e1b5..129806c3 100644
--- a/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
@@ -705,8 +705,14 @@ public class ShuffleTaskManagerTest extends HdfsTestBase {
     // make sure heartbeat timeout and resources are removed
     Thread.sleep(5000);
 
+    // Create the hidden dir to simulate LocalStorageChecker's check
+    String storageDir = tempDir.getAbsolutePath();
+    File hiddenFile = new File(storageDir + "/" + 
LocalStorageChecker.CHECKER_DIR_NAME);
+    hiddenFile.mkdir();
+
     appIdsOnDisk = getAppIdsOnDisk(localStorageManager);
     assertFalse(appIdsOnDisk.contains(appId));
+    assertFalse(appIdsOnDisk.contains(LocalStorageChecker.CHECKER_DIR_NAME));
 
     // mock leak shuffle data
     File file = new File(tempDir, appId);
@@ -717,6 +723,7 @@ public class ShuffleTaskManagerTest extends HdfsTestBase {
     // execute checkLeakShuffleData
     shuffleTaskManager.checkLeakShuffleData();
     assertFalse(file.exists());
+    assertTrue(hiddenFile.exists());
   }
 
   private Set<String> getAppIdsOnDisk(LocalStorageManager localStorageManager) 
{
diff --git 
a/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java 
b/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
index 185ce58f..e5dd840a 100644
--- a/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
+++ b/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
@@ -323,7 +323,7 @@ public class LocalStorage extends AbstractStorage {
     File[] files = baseFolder.listFiles();
     if (files != null) {
       for (File file : files) {
-        if (file.isDirectory()) {
+        if (file.isDirectory() && !file.isHidden()) {
           appIds.add(file.getName());
         }
       }

Reply via email to