This is an automated email from the ASF dual-hosted git repository. roryqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push: new 517e5b16 [IMPROVEMENT] Add more metrics about local storage info (#205) 517e5b16 is described below commit 517e5b16f335aacdef828ad0ea0be974a55d17c1 Author: Junfan Zhang <junfan.zh...@outlook.com> AuthorDate: Fri Sep 9 23:07:28 2022 +0800 [IMPROVEMENT] Add more metrics about local storage info (#205) ### What changes were proposed in this pull request? Introduce more metrics about local storage info. ### Why are the changes needed? In the current codebase, there are no metrics for local storage space info. After this PR, we can monitor disk utilization easily. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UTs. --- .../org/apache/uniffle/server/HealthCheck.java | 2 ++ .../apache/uniffle/server/LocalStorageChecker.java | 15 ++++++++++ .../uniffle/server/ShuffleServerMetrics.java | 21 +++++++++++++ .../org/apache/uniffle/server/HealthCheckTest.java | 18 ++++++++++++ .../uniffle/server/ShuffleServerMetricsTest.java | 2 +- .../apache/uniffle/server/StorageCheckerTest.java | 34 ++++++++++++++++++++++ 6 files changed, 91 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java index dd14217b..ac21024b 100644 --- a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java +++ b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java @@ -83,9 +83,11 @@ public class HealthCheck { for (Checker checker : checkers) { if (!checker.checkIsHealthy()) { isHealthy.set(false); + ShuffleServerMetrics.gaugeIsHealthy.set(1); return; } } + ShuffleServerMetrics.gaugeIsHealthy.set(0); isHealthy.set(true); } diff --git a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java index c0163b39..81ccfe24 100644 --- 
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java +++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java @@ -66,16 +66,31 @@ public class LocalStorageChecker extends Checker { @Override public boolean checkIsHealthy() { int num = 0; + Long totalSpace = 0L; + Long usedSpace = 0L; + int corruptedDirs = 0; + for (StorageInfo storageInfo : storageInfos) { if (!storageInfo.checkStorageReadAndWrite()) { storageInfo.markCorrupted(); + corruptedDirs++; continue; } + + totalSpace += getTotalSpace(storageInfo.storageDir); + usedSpace += getUsedSpace(storageInfo.storageDir); + if (storageInfo.checkIsSpaceEnough()) { num++; } } + ShuffleServerMetrics.gaugeLocalStorageTotalSpace.set(totalSpace); + ShuffleServerMetrics.gaugeLocalStorageUsedSpace.set(usedSpace); + ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.set(storageInfos.size()); + ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.set(corruptedDirs); + ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.set(usedSpace.doubleValue() / totalSpace.doubleValue()); + if (storageInfos.isEmpty()) { if (isHealthy) { LOG.info("shuffle server become unhealthy because of empty storage"); diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java index d035cd6c..88f921b2 100644 --- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java +++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java @@ -53,6 +53,13 @@ public class ShuffleServerMetrics { private static final String TOTAL_READ_MEMORY_DATA = "total_read_memory_data"; private static final String TOTAL_READ_TIME = "total_read_time"; + private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = "local_storage_total_dirs_num"; + private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = "local_storage_corrupted_dirs_num"; + private static final String LOCAL_STORAGE_TOTAL_SPACE = 
"local_storage_total_space"; + private static final String LOCAL_STORAGE_USED_SPACE = "local_storage_used_space"; + private static final String LOCAL_STORAGE_USED_SPACE_RATIO = "local_storage_used_space_ratio"; + + private static final String IS_HEALTHY = "is_healthy"; private static final String REGISTERED_SHUFFLE = "registered_shuffle"; private static final String REGISTERED_SHUFFLE_ENGINE = "registered_shuffle_engine"; private static final String BUFFERED_DATA_SIZE = "buffered_data_size"; @@ -101,6 +108,13 @@ public class ShuffleServerMetrics { public static Counter counterLocalStorageFailedWrite; public static Counter counterLocalStorageSuccessWrite; + public static Gauge gaugeLocalStorageTotalDirsNum; + public static Gauge gaugeLocalStorageCorruptedDirsNum; + public static Gauge gaugeLocalStorageTotalSpace; + public static Gauge gaugeLocalStorageUsedSpace; + public static Gauge gaugeLocalStorageUsedSpaceRatio; + + public static Gauge gaugeIsHealthy; public static Gauge gaugeRegisteredShuffle; public static Gauge gaugeRegisteredShuffleEngine; public static Gauge gaugeBufferDataSize; @@ -239,6 +253,13 @@ public class ShuffleServerMetrics { counterLocalStorageFailedWrite = metricsManager.addCounter(STORAGE_FAILED_WRITE_LOCAL); counterLocalStorageSuccessWrite = metricsManager.addCounter(STORAGE_SUCCESS_WRITE_LOCAL); + gaugeLocalStorageTotalDirsNum = metricsManager.addGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM); + gaugeLocalStorageCorruptedDirsNum = metricsManager.addGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM); + gaugeLocalStorageTotalSpace = metricsManager.addGauge(LOCAL_STORAGE_TOTAL_SPACE); + gaugeLocalStorageUsedSpace = metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE); + gaugeLocalStorageUsedSpaceRatio = metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE_RATIO); + + gaugeIsHealthy = metricsManager.addGauge(IS_HEALTHY); gaugeRegisteredShuffle = metricsManager.addGauge(REGISTERED_SHUFFLE); gaugeRegisteredShuffleEngine = metricsManager.addGauge(REGISTERED_SHUFFLE_ENGINE); 
gaugeBufferDataSize = metricsManager.addGauge(BUFFERED_DATA_SIZE); diff --git a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java index ace60f41..fc202a44 100644 --- a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java +++ b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java @@ -21,15 +21,28 @@ import java.util.Arrays; import java.util.concurrent.atomic.AtomicBoolean; import com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.apache.uniffle.storage.util.StorageType; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; public class HealthCheckTest { + @BeforeAll + public static void setup() { + ShuffleServerMetrics.register(); + } + + @AfterAll + public static void clear() { + ShuffleServerMetrics.clear(); + } + @Test public void buildInCheckerTest() { ShuffleServerConf conf = new ShuffleServerConf(); @@ -67,15 +80,20 @@ public class HealthCheckTest { HealthCheck checker = new HealthCheck(healthy, conf, Lists.newArrayList()); checker.check(); assertTrue(healthy.get()); + assertEquals(0, ShuffleServerMetrics.gaugeIsHealthy.get()); + conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(), UnHealthyMockChecker.class.getCanonicalName()); checker = new HealthCheck(healthy, conf, Lists.newArrayList()); checker.check(); assertFalse(healthy.get()); + assertEquals(1, ShuffleServerMetrics.gaugeIsHealthy.get()); + conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(), UnHealthyMockChecker.class.getCanonicalName() + "," + HealthyMockChecker.class.getCanonicalName()); checker = new HealthCheck(healthy, conf, Lists.newArrayList()); checker.check(); assertFalse(healthy.get()); + assertEquals(1, 
ShuffleServerMetrics.gaugeIsHealthy.get()); } private void assertConf(ShuffleServerConf conf) { diff --git a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java index 404a4fff..cd6b33ec 100644 --- a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java +++ b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java @@ -90,7 +90,7 @@ public class ShuffleServerMetricsTest { JsonNode actualObj = mapper.readTree(content); assertEquals(2, actualObj.size()); JsonNode metricsNode = actualObj.get("metrics"); - assertEquals(40, metricsNode.size()); + assertEquals(46, metricsNode.size()); List<String> expectedMetricNames = Lists.newArrayList( ShuffleServerMetrics.STORAGE_TOTAL_WRITE_REMOTE_PREFIX + STORAGE_HOST, diff --git a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java index 1282d7bd..078f9502 100644 --- a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java +++ b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java @@ -22,11 +22,14 @@ import java.util.Arrays; import java.util.List; import com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.apache.uniffle.storage.common.LocalStorage; import org.apache.uniffle.storage.util.StorageType; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -34,6 +37,16 @@ public class StorageCheckerTest { private int callTimes = 0; + @BeforeAll + public static void setup() { + ShuffleServerMetrics.register(); + } + + @AfterAll + public static void clear() { + ShuffleServerMetrics.clear(); + } + @Test public void checkTest() throws 
Exception { ShuffleServerConf conf = new ShuffleServerConf(); @@ -48,22 +61,43 @@ public class StorageCheckerTest { LocalStorageChecker checker = new MockStorageChecker(conf, storages); assertTrue(checker.checkIsHealthy()); + assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); + assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(0.2, ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.get()); + assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); + assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); callTimes++; assertTrue(checker.checkIsHealthy()); + assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); + assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); + assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); callTimes++; assertFalse(checker.checkIsHealthy()); + assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); + assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); + assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); callTimes++; assertTrue(checker.checkIsHealthy()); conf.set(ShuffleServerConf.HEALTH_MIN_STORAGE_PERCENTAGE, 80.0); checker = new MockStorageChecker(conf, storages); assertFalse(checker.checkIsHealthy()); + assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); + assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); + assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); callTimes++; checker.checkIsHealthy(); assertTrue(checker.checkIsHealthy()); + assertEquals(3000, 
ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); + assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); + assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); } private class MockStorageChecker extends LocalStorageChecker {