This is an automated email from the ASF dual-hosted git repository.
zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 56f49947b [#2241] feat(server): Introduce option to mark server
unhealthy once any storage corrupted (#2245)
56f49947b is described below
commit 56f49947be1233bf0d4d83eabd8a1b822e07c2c3
Author: Junfan Zhang <[email protected]>
AuthorDate: Mon Nov 11 19:37:55 2024 +0800
[#2241] feat(server): Introduce option to mark server unhealthy once any
storage corrupted (#2245)
### What changes were proposed in this pull request?
Introduce an option to mark the server unhealthy once any storage is corrupted.
### Why are the changes needed?
For: #2241
This feature reduces the impact on the cluster when local directories are
corrupted.
### Does this PR introduce _any_ user-facing change?
Yes.
`rss.server.health.markUnhealthyOnceStorageCorruption` is introduced. Its
default value is false, so this feature is disabled unless explicitly enabled.
### How was this patch tested?
Existing unit tests.
Co-authored-by: Junfan Zhang <[email protected]>
---
.../java/org/apache/uniffle/server/LocalStorageChecker.java | 13 +++++++++++++
.../java/org/apache/uniffle/server/ShuffleServerConf.java | 7 +++++++
2 files changed, 20 insertions(+)
diff --git
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index 10b12e74c..662fccc16 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -62,6 +62,7 @@ public class LocalStorageChecker extends Checker {
private boolean isHealthy = true;
private ExecutorService workers;
private ReconfigurableConfManager.Reconfigurable<Long>
diskCheckerExecutionTimeoutMs;
+ private boolean markUnhealthyOnceDirCorrupted = false;
public LocalStorageChecker(ShuffleServerConf conf, List<LocalStorage>
storages) {
super(conf);
@@ -88,6 +89,9 @@ public class LocalStorageChecker extends Checker {
this.diskCheckerExecutionTimeoutMs =
conf.getReconfigurableConf(ShuffleServerConf.HEALTH_CHECKER_LOCAL_STORAGE_EXECUTE_TIMEOUT);
this.workers = Executors.newFixedThreadPool(basePaths.size());
+
+ this.markUnhealthyOnceDirCorrupted =
+ conf.get(ShuffleServerConf.SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION);
}
@Override
@@ -179,6 +183,15 @@ public class LocalStorageChecker extends Checker {
return false;
}
+ if (markUnhealthyOnceDirCorrupted && corruptedDirs.get() > 0) {
+ if (isHealthy) {
+ LOG.info(
+ "shuffle server become unhealthy because {} corrupted dirs exist",
corruptedDirs.get());
+ }
+ isHealthy = false;
+ return false;
+ }
+
double availablePercentage = num.get() * 100.0 / storageInfos.size();
if (Double.compare(availablePercentage, minStorageHealthyPercentage) >= 0)
{
if (!isHealthy) {
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
index 677127b3b..a1a4f1f9e 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java
@@ -267,6 +267,13 @@ public class ShuffleServerConf extends RssBaseConf {
.defaultValue(2 * 1024L * 1024L)
.withDescription("The index file size hint");
+ public static final ConfigOption<Boolean>
SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION =
+ ConfigOptions.key("rss.server.health.markUnhealthyOnceStorageCorruption")
+ .booleanType()
+ .defaultValue(false)
+ .withDescription(
+ "Mark server unhealthy once any storage corrupted. Default value
is false");
+
public static final ConfigOption<Double> HEALTH_STORAGE_MAX_USAGE_PERCENTAGE
=
ConfigOptions.key("rss.server.health.max.storage.usage.percentage")
.doubleType()