This is an automated email from the ASF dual-hosted git repository.
zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 3ec3f416 [ISSUE-392] Fix the bug in the shuffle data cleanup checker
that causes false reports of disk corruption (#393)
3ec3f416 is described below
commit 3ec3f416596504d78284985a6b6fd64f1ba0d26d
Author: Junfan Zhang <[email protected]>
AuthorDate: Thu Dec 8 19:19:06 2022 +0800
[ISSUE-392] Fix the bug in the shuffle data cleanup checker that causes
false reports of disk corruption (#393)
### What changes were proposed in this pull request?
[ISSUE-392] Fix the bug in the shuffle data cleanup checker that causes
false reports of disk corruption #393
### Why are the changes needed?
Fix the bug in the shuffle data checker that causes false reports of disk
corruption during cleanup.
```
[INFO] 2022-12-07 16:27:51,411 leakShuffleDataChecker ShuffleTaskManager
checkLeakShuffleData - Start check leak shuffle data
[INFO] 2022-12-07 16:27:51,416 leakShuffleDataChecker
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with
/data1/uniffle/data/check cost 0 ms
[INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with
/data2/uniffle/data/check cost 0 ms
[INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with
/data3/uniffle/data/check cost 0 ms
[INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker
LocalFileDeleteHandler delete - Delete shuffle data for appId[check] with
/data4/uniffle/data/check cost 0 ms
[INFO] 2022-12-07 16:27:51,420 leakShuffleDataChecker ShuffleTaskManager
checkLeakShuffleData - Finish check leak shuffle data
[ERROR] 2022-12-07 16:27:51,685 HealthCheckService LocalStorageChecker
checkStorageReadAndWrite - Storage read and write error
java.io.FileNotFoundException: /data4/uniffle/data/check/test (No such file
or directory)
at java.io.FileInputStream.open0(Native Method)
at java.io.FileInputStream.open(FileInputStream.java:195)
at java.io.FileInputStream.<init>(FileInputStream.java:138)
at
org.apache.uniffle.server.LocalStorageChecker$StorageInfo.checkStorageReadAndWrite(LocalStorageChecker.java:180)
at
org.apache.uniffle.server.LocalStorageChecker.checkIsHealthy(LocalStorageChecker.java:73)
at org.apache.uniffle.server.HealthCheck.check(HealthCheck.java:84)
at
org.apache.uniffle.server.HealthCheck.lambda$new$0(HealthCheck.java:70)
at java.lang.Thread.run(Thread.java:745)
[INFO] 2022-12-07 16:27:51,685 HealthCheckService LocalStorageChecker
checkIsHealthy - shuffle server become unhealthy
```
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
1. UTs
---
.../main/java/org/apache/uniffle/server/LocalStorageChecker.java | 8 +++++---
.../java/org/apache/uniffle/server/ShuffleTaskManagerTest.java | 7 +++++++
.../main/java/org/apache/uniffle/storage/common/LocalStorage.java | 2 +-
3 files changed, 13 insertions(+), 4 deletions(-)
diff --git
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index fc9475dd..89019393 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -37,6 +37,7 @@ import org.apache.uniffle.storage.util.ShuffleStorageUtils;
public class LocalStorageChecker extends Checker {
private static final Logger LOG =
LoggerFactory.getLogger(LocalStorageChecker.class);
+ public static final String CHECKER_DIR_NAME = ".check";
private final double diskMaxUsagePercentage;
private final double diskRecoveryUsagePercentage;
@@ -164,7 +165,8 @@ public class LocalStorageChecker extends Checker {
if (storage.isCorrupted()) {
return false;
}
- File checkDir = new File(storageDir, "check");
+ // Use the hidden file to avoid being cleanup
+ File checkDir = new File(storageDir, CHECKER_DIR_NAME);
try {
if (!checkDir.mkdirs()) {
return false;
@@ -196,13 +198,13 @@ public class LocalStorageChecker extends Checker {
} while (readBytes != -1);
}
} catch (Exception e) {
- LOG.error("Storage read and write error ", e);
+ LOG.error("Storage read and write error. Storage dir: {}", storageDir,
e);
return false;
} finally {
try {
FileUtils.deleteDirectory(checkDir);
} catch (IOException ioe) {
- LOG.error("delete directory fail", ioe);
+ LOG.error("delete directory fail. Storage dir: {}", storageDir, ioe);
return false;
}
}
diff --git
a/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
b/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
index 03d1e1b5..129806c3 100644
--- a/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/ShuffleTaskManagerTest.java
@@ -705,8 +705,14 @@ public class ShuffleTaskManagerTest extends HdfsTestBase {
// make sure heartbeat timeout and resources are removed
Thread.sleep(5000);
+ // Create the hidden dir to simulate LocalStorageChecker's check
+ String storageDir = tempDir.getAbsolutePath();
+ File hiddenFile = new File(storageDir + "/" +
LocalStorageChecker.CHECKER_DIR_NAME);
+ hiddenFile.mkdir();
+
appIdsOnDisk = getAppIdsOnDisk(localStorageManager);
assertFalse(appIdsOnDisk.contains(appId));
+ assertFalse(appIdsOnDisk.contains(LocalStorageChecker.CHECKER_DIR_NAME));
// mock leak shuffle data
File file = new File(tempDir, appId);
@@ -717,6 +723,7 @@ public class ShuffleTaskManagerTest extends HdfsTestBase {
// execute checkLeakShuffleData
shuffleTaskManager.checkLeakShuffleData();
assertFalse(file.exists());
+ assertTrue(hiddenFile.exists());
}
private Set<String> getAppIdsOnDisk(LocalStorageManager localStorageManager)
{
diff --git
a/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
b/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
index 185ce58f..e5dd840a 100644
--- a/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
+++ b/storage/src/main/java/org/apache/uniffle/storage/common/LocalStorage.java
@@ -323,7 +323,7 @@ public class LocalStorage extends AbstractStorage {
File[] files = baseFolder.listFiles();
if (files != null) {
for (File file : files) {
- if (file.isDirectory()) {
+ if (file.isDirectory() && !file.isHidden()) {
appIds.add(file.getName());
}
}