This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 56f49947b [#2241] feat(server): Introduce option to mark server 
unhealthy once any storage corrupted (#2245)
56f49947b is described below

commit 56f49947be1233bf0d4d83eabd8a1b822e07c2c3
Author: Junfan Zhang <[email protected]>
AuthorDate: Mon Nov 11 19:37:55 2024 +0800

    [#2241] feat(server): Introduce option to mark server unhealthy once any 
storage corrupted (#2245)
    
    ### What changes were proposed in this pull request?
    
    Introduce an option to mark the server unhealthy once any storage is corrupted.
    
    ### Why are the changes needed?
    
    For: #2241
    This feature reduces the impact on the cluster when local directories are 
corrupted.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes.
    
    `rss.server.health.markUnhealthyOnceStorageCorruption` is introduced. Its 
default value is false, so this feature is not activated by default.
    
    
    ### How was this patch tested?
    
    Existing unit tests.
    
    Co-authored-by: Junfan Zhang <[email protected]>
---
 .../java/org/apache/uniffle/server/LocalStorageChecker.java | 13 +++++++++++++
 .../java/org/apache/uniffle/server/ShuffleServerConf.java   |  7 +++++++
 2 files changed, 20 insertions(+)

diff --git 
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java 
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index 10b12e74c..662fccc16 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -62,6 +62,7 @@ public class LocalStorageChecker extends Checker {
   private boolean isHealthy = true;
   private ExecutorService workers;
   private ReconfigurableConfManager.Reconfigurable<Long> 
diskCheckerExecutionTimeoutMs;
+  private boolean markUnhealthyOnceDirCorrupted = false;
 
   public LocalStorageChecker(ShuffleServerConf conf, List<LocalStorage> 
storages) {
     super(conf);
@@ -88,6 +89,9 @@ public class LocalStorageChecker extends Checker {
     this.diskCheckerExecutionTimeoutMs =
         
conf.getReconfigurableConf(ShuffleServerConf.HEALTH_CHECKER_LOCAL_STORAGE_EXECUTE_TIMEOUT);
     this.workers = Executors.newFixedThreadPool(basePaths.size());
+
+    this.markUnhealthyOnceDirCorrupted =
+        conf.get(ShuffleServerConf.SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION);
   }
 
   @Override
@@ -179,6 +183,15 @@ public class LocalStorageChecker extends Checker {
       return false;
     }
 
+    if (markUnhealthyOnceDirCorrupted && corruptedDirs.get() > 0) {
+      if (isHealthy) {
+        LOG.info(
+            "shuffle server become unhealthy because {} corrupted dirs exist", 
corruptedDirs.get());
+      }
+      isHealthy = false;
+      return false;
+    }
+
     double availablePercentage = num.get() * 100.0 / storageInfos.size();
     if (Double.compare(availablePercentage, minStorageHealthyPercentage) >= 0) 
{
       if (!isHealthy) {
diff --git 
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java 
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
index 677127b3b..a1a4f1f9e 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
@@ -267,6 +267,13 @@ public class ShuffleServerConf extends RssBaseConf {
           .defaultValue(2 * 1024L * 1024L)
           .withDescription("The index file size hint");
 
+  public static final ConfigOption<Boolean> 
SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION =
+      ConfigOptions.key("rss.server.health.markUnhealthyOnceStorageCorruption")
+          .booleanType()
+          .defaultValue(false)
+          .withDescription(
+              "Mark server unhealthy once any storage corrupted. Default value 
is false");
+
   public static final ConfigOption<Double> HEALTH_STORAGE_MAX_USAGE_PERCENTAGE 
=
       ConfigOptions.key("rss.server.health.max.storage.usage.percentage")
           .doubleType()

Reply via email to