This is an automated email from the ASF dual-hosted git repository.
roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 517e5b16 [IMPROVEMENT] Add more metrics about local storage info (#205)
517e5b16 is described below
commit 517e5b16f335aacdef828ad0ea0be974a55d17c1
Author: Junfan Zhang <[email protected]>
AuthorDate: Fri Sep 9 23:07:28 2022 +0800
[IMPROVEMENT] Add more metrics about local storage info (#205)
### What changes were proposed in this pull request?
Introduce more metrics about local storage info.
### Why are the changes needed?
The current codebase has no metrics for local storage space.
With this PR, disk utilization can be monitored easily.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Unit tests.
---
.../org/apache/uniffle/server/HealthCheck.java | 2 ++
.../apache/uniffle/server/LocalStorageChecker.java | 15 ++++++++++
.../uniffle/server/ShuffleServerMetrics.java | 21 +++++++++++++
.../org/apache/uniffle/server/HealthCheckTest.java | 18 ++++++++++++
.../uniffle/server/ShuffleServerMetricsTest.java | 2 +-
.../apache/uniffle/server/StorageCheckerTest.java | 34 ++++++++++++++++++++++
6 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
index dd14217b..ac21024b 100644
--- a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
+++ b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
@@ -83,9 +83,11 @@ public class HealthCheck {
for (Checker checker : checkers) {
if (!checker.checkIsHealthy()) {
isHealthy.set(false);
+ ShuffleServerMetrics.gaugeIsHealthy.set(1);
return;
}
}
+ ShuffleServerMetrics.gaugeIsHealthy.set(0);
isHealthy.set(true);
}
diff --git
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index c0163b39..81ccfe24 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -66,16 +66,31 @@ public class LocalStorageChecker extends Checker {
@Override
public boolean checkIsHealthy() {
int num = 0;
+ Long totalSpace = 0L;
+ Long usedSpace = 0L;
+ int corruptedDirs = 0;
+
for (StorageInfo storageInfo : storageInfos) {
if (!storageInfo.checkStorageReadAndWrite()) {
storageInfo.markCorrupted();
+ corruptedDirs++;
continue;
}
+
+ totalSpace += getTotalSpace(storageInfo.storageDir);
+ usedSpace += getUsedSpace(storageInfo.storageDir);
+
if (storageInfo.checkIsSpaceEnough()) {
num++;
}
}
+ ShuffleServerMetrics.gaugeLocalStorageTotalSpace.set(totalSpace);
+ ShuffleServerMetrics.gaugeLocalStorageUsedSpace.set(usedSpace);
+
ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.set(storageInfos.size());
+ ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.set(corruptedDirs);
+
ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.set(usedSpace.doubleValue()
/ totalSpace.doubleValue());
+
if (storageInfos.isEmpty()) {
if (isHealthy) {
LOG.info("shuffle server become unhealthy because of empty storage");
diff --git
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index d035cd6c..88f921b2 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -53,6 +53,13 @@ public class ShuffleServerMetrics {
private static final String TOTAL_READ_MEMORY_DATA =
"total_read_memory_data";
private static final String TOTAL_READ_TIME = "total_read_time";
+ private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM =
"local_storage_total_dirs_num";
+ private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM =
"local_storage_corrupted_dirs_num";
+ private static final String LOCAL_STORAGE_TOTAL_SPACE =
"local_storage_total_space";
+ private static final String LOCAL_STORAGE_USED_SPACE =
"local_storage_used_space";
+ private static final String LOCAL_STORAGE_USED_SPACE_RATIO =
"local_storage_used_space_ratio";
+
+ private static final String IS_HEALTHY = "is_healthy";
private static final String REGISTERED_SHUFFLE = "registered_shuffle";
private static final String REGISTERED_SHUFFLE_ENGINE =
"registered_shuffle_engine";
private static final String BUFFERED_DATA_SIZE = "buffered_data_size";
@@ -101,6 +108,13 @@ public class ShuffleServerMetrics {
public static Counter counterLocalStorageFailedWrite;
public static Counter counterLocalStorageSuccessWrite;
+ public static Gauge gaugeLocalStorageTotalDirsNum;
+ public static Gauge gaugeLocalStorageCorruptedDirsNum;
+ public static Gauge gaugeLocalStorageTotalSpace;
+ public static Gauge gaugeLocalStorageUsedSpace;
+ public static Gauge gaugeLocalStorageUsedSpaceRatio;
+
+ public static Gauge gaugeIsHealthy;
public static Gauge gaugeRegisteredShuffle;
public static Gauge gaugeRegisteredShuffleEngine;
public static Gauge gaugeBufferDataSize;
@@ -239,6 +253,13 @@ public class ShuffleServerMetrics {
counterLocalStorageFailedWrite =
metricsManager.addCounter(STORAGE_FAILED_WRITE_LOCAL);
counterLocalStorageSuccessWrite =
metricsManager.addCounter(STORAGE_SUCCESS_WRITE_LOCAL);
+ gaugeLocalStorageTotalDirsNum =
metricsManager.addGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
+ gaugeLocalStorageCorruptedDirsNum =
metricsManager.addGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
+ gaugeLocalStorageTotalSpace =
metricsManager.addGauge(LOCAL_STORAGE_TOTAL_SPACE);
+ gaugeLocalStorageUsedSpace =
metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE);
+ gaugeLocalStorageUsedSpaceRatio =
metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE_RATIO);
+
+ gaugeIsHealthy = metricsManager.addGauge(IS_HEALTHY);
gaugeRegisteredShuffle = metricsManager.addGauge(REGISTERED_SHUFFLE);
gaugeRegisteredShuffleEngine =
metricsManager.addGauge(REGISTERED_SHUFFLE_ENGINE);
gaugeBufferDataSize = metricsManager.addGauge(BUFFERED_DATA_SIZE);
diff --git
a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
index ace60f41..fc202a44 100644
--- a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
@@ -21,15 +21,28 @@ import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.collect.Lists;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.apache.uniffle.storage.util.StorageType;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class HealthCheckTest {
+ @BeforeAll
+ public static void setup() {
+ ShuffleServerMetrics.register();
+ }
+
+ @AfterAll
+ public static void clear() {
+ ShuffleServerMetrics.clear();
+ }
+
@Test
public void buildInCheckerTest() {
ShuffleServerConf conf = new ShuffleServerConf();
@@ -67,15 +80,20 @@ public class HealthCheckTest {
HealthCheck checker = new HealthCheck(healthy, conf, Lists.newArrayList());
checker.check();
assertTrue(healthy.get());
+ assertEquals(0, ShuffleServerMetrics.gaugeIsHealthy.get());
+
conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(),
UnHealthyMockChecker.class.getCanonicalName());
checker = new HealthCheck(healthy, conf, Lists.newArrayList());
checker.check();
assertFalse(healthy.get());
+ assertEquals(1, ShuffleServerMetrics.gaugeIsHealthy.get());
+
conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(),
UnHealthyMockChecker.class.getCanonicalName() + "," +
HealthyMockChecker.class.getCanonicalName());
checker = new HealthCheck(healthy, conf, Lists.newArrayList());
checker.check();
assertFalse(healthy.get());
+ assertEquals(1, ShuffleServerMetrics.gaugeIsHealthy.get());
}
private void assertConf(ShuffleServerConf conf) {
diff --git
a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
index 404a4fff..cd6b33ec 100644
---
a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
+++
b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
@@ -90,7 +90,7 @@ public class ShuffleServerMetricsTest {
JsonNode actualObj = mapper.readTree(content);
assertEquals(2, actualObj.size());
JsonNode metricsNode = actualObj.get("metrics");
- assertEquals(40, metricsNode.size());
+ assertEquals(46, metricsNode.size());
List<String> expectedMetricNames = Lists.newArrayList(
ShuffleServerMetrics.STORAGE_TOTAL_WRITE_REMOTE_PREFIX + STORAGE_HOST,
diff --git
a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
index 1282d7bd..078f9502 100644
--- a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
@@ -22,11 +22,14 @@ import java.util.Arrays;
import java.util.List;
import com.google.common.collect.Lists;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.apache.uniffle.storage.common.LocalStorage;
import org.apache.uniffle.storage.util.StorageType;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -34,6 +37,16 @@ public class StorageCheckerTest {
private int callTimes = 0;
+ @BeforeAll
+ public static void setup() {
+ ShuffleServerMetrics.register();
+ }
+
+ @AfterAll
+ public static void clear() {
+ ShuffleServerMetrics.clear();
+ }
+
@Test
public void checkTest() throws Exception {
ShuffleServerConf conf = new ShuffleServerConf();
@@ -48,22 +61,43 @@ public class StorageCheckerTest {
LocalStorageChecker checker = new MockStorageChecker(conf, storages);
assertTrue(checker.checkIsHealthy());
+ assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+ assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+ assertEquals(0.2,
ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.get());
+ assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+ assertEquals(0,
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
callTimes++;
assertTrue(checker.checkIsHealthy());
+ assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+ assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+ assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+ assertEquals(0,
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
callTimes++;
assertFalse(checker.checkIsHealthy());
+ assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+ assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+ assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+ assertEquals(0,
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
callTimes++;
assertTrue(checker.checkIsHealthy());
conf.set(ShuffleServerConf.HEALTH_MIN_STORAGE_PERCENTAGE, 80.0);
checker = new MockStorageChecker(conf, storages);
assertFalse(checker.checkIsHealthy());
+ assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+ assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+ assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+ assertEquals(0,
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
callTimes++;
checker.checkIsHealthy();
assertTrue(checker.checkIsHealthy());
+ assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+ assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+ assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+ assertEquals(0,
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
}
private class MockStorageChecker extends LocalStorageChecker {