This is an automated email from the ASF dual-hosted git repository.

roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 517e5b16 [IMPROVEMENT] Add more metrics about local storage info (#205)
517e5b16 is described below

commit 517e5b16f335aacdef828ad0ea0be974a55d17c1
Author: Junfan Zhang <junfan.zh...@outlook.com>
AuthorDate: Fri Sep 9 23:07:28 2022 +0800

    [IMPROVEMENT] Add more metrics about local storage info (#205)
    
    ### What changes were proposed in this pull request?
    Introduce more metrics about local storage info.
    
    ### Why are the changes needed?
    The current codebase has no metrics for local storage space info.
    After this PR, we can monitor disk utilization easily.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    UTs.
---
 .../org/apache/uniffle/server/HealthCheck.java     |  2 ++
 .../apache/uniffle/server/LocalStorageChecker.java | 15 ++++++++++
 .../uniffle/server/ShuffleServerMetrics.java       | 21 +++++++++++++
 .../org/apache/uniffle/server/HealthCheckTest.java | 18 ++++++++++++
 .../uniffle/server/ShuffleServerMetricsTest.java   |  2 +-
 .../apache/uniffle/server/StorageCheckerTest.java  | 34 ++++++++++++++++++++++
 6 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java 
b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
index dd14217b..ac21024b 100644
--- a/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
+++ b/server/src/main/java/org/apache/uniffle/server/HealthCheck.java
@@ -83,9 +83,11 @@ public class HealthCheck {
     for (Checker checker : checkers) {
       if (!checker.checkIsHealthy()) {
         isHealthy.set(false);
+        ShuffleServerMetrics.gaugeIsHealthy.set(1);
         return;
       }
     }
+    ShuffleServerMetrics.gaugeIsHealthy.set(0);
     isHealthy.set(true);
   }
 
diff --git 
a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java 
b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
index c0163b39..81ccfe24 100644
--- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
+++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java
@@ -66,16 +66,31 @@ public class LocalStorageChecker extends Checker {
   @Override
   public boolean checkIsHealthy() {
     int num = 0;
+    Long totalSpace = 0L;
+    Long usedSpace = 0L;
+    int corruptedDirs = 0;
+
     for (StorageInfo storageInfo : storageInfos) {
       if (!storageInfo.checkStorageReadAndWrite()) {
         storageInfo.markCorrupted();
+        corruptedDirs++;
         continue;
       }
+
+      totalSpace += getTotalSpace(storageInfo.storageDir);
+      usedSpace += getUsedSpace(storageInfo.storageDir);
+
       if (storageInfo.checkIsSpaceEnough()) {
         num++;
       }
     }
 
+    ShuffleServerMetrics.gaugeLocalStorageTotalSpace.set(totalSpace);
+    ShuffleServerMetrics.gaugeLocalStorageUsedSpace.set(usedSpace);
+    
ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.set(storageInfos.size());
+    ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.set(corruptedDirs);
+    
ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.set(usedSpace.doubleValue()
 / totalSpace.doubleValue());
+
     if (storageInfos.isEmpty()) {
       if (isHealthy) {
         LOG.info("shuffle server become unhealthy because of empty storage");
diff --git 
a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java 
b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index d035cd6c..88f921b2 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -53,6 +53,13 @@ public class ShuffleServerMetrics {
   private static final String TOTAL_READ_MEMORY_DATA = 
"total_read_memory_data";
   private static final String TOTAL_READ_TIME = "total_read_time";
 
+  private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = 
"local_storage_total_dirs_num";
+  private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = 
"local_storage_corrupted_dirs_num";
+  private static final String LOCAL_STORAGE_TOTAL_SPACE = 
"local_storage_total_space";
+  private static final String LOCAL_STORAGE_USED_SPACE = 
"local_storage_used_space";
+  private static final String LOCAL_STORAGE_USED_SPACE_RATIO = 
"local_storage_used_space_ratio";
+
+  private static final String IS_HEALTHY = "is_healthy";
   private static final String REGISTERED_SHUFFLE = "registered_shuffle";
   private static final String REGISTERED_SHUFFLE_ENGINE = 
"registered_shuffle_engine";
   private static final String BUFFERED_DATA_SIZE = "buffered_data_size";
@@ -101,6 +108,13 @@ public class ShuffleServerMetrics {
   public static Counter counterLocalStorageFailedWrite;
   public static Counter counterLocalStorageSuccessWrite;
 
+  public static Gauge gaugeLocalStorageTotalDirsNum;
+  public static Gauge gaugeLocalStorageCorruptedDirsNum;
+  public static Gauge gaugeLocalStorageTotalSpace;
+  public static Gauge gaugeLocalStorageUsedSpace;
+  public static Gauge gaugeLocalStorageUsedSpaceRatio;
+
+  public static Gauge gaugeIsHealthy;
   public static Gauge gaugeRegisteredShuffle;
   public static Gauge gaugeRegisteredShuffleEngine;
   public static Gauge gaugeBufferDataSize;
@@ -239,6 +253,13 @@ public class ShuffleServerMetrics {
     counterLocalStorageFailedWrite = 
metricsManager.addCounter(STORAGE_FAILED_WRITE_LOCAL);
     counterLocalStorageSuccessWrite = 
metricsManager.addCounter(STORAGE_SUCCESS_WRITE_LOCAL);
 
+    gaugeLocalStorageTotalDirsNum = 
metricsManager.addGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
+    gaugeLocalStorageCorruptedDirsNum = 
metricsManager.addGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
+    gaugeLocalStorageTotalSpace = 
metricsManager.addGauge(LOCAL_STORAGE_TOTAL_SPACE);
+    gaugeLocalStorageUsedSpace = 
metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE);
+    gaugeLocalStorageUsedSpaceRatio = 
metricsManager.addGauge(LOCAL_STORAGE_USED_SPACE_RATIO);
+
+    gaugeIsHealthy = metricsManager.addGauge(IS_HEALTHY);
     gaugeRegisteredShuffle = metricsManager.addGauge(REGISTERED_SHUFFLE);
     gaugeRegisteredShuffleEngine = 
metricsManager.addGauge(REGISTERED_SHUFFLE_ENGINE);
     gaugeBufferDataSize = metricsManager.addGauge(BUFFERED_DATA_SIZE);
diff --git 
a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java 
b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
index ace60f41..fc202a44 100644
--- a/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/HealthCheckTest.java
@@ -21,15 +21,28 @@ import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 import com.google.common.collect.Lists;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import org.apache.uniffle.storage.util.StorageType;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 public class HealthCheckTest {
 
+  @BeforeAll
+  public static void setup() {
+    ShuffleServerMetrics.register();
+  }
+
+  @AfterAll
+  public static void clear() {
+    ShuffleServerMetrics.clear();
+  }
+
   @Test
   public void buildInCheckerTest() {
     ShuffleServerConf conf = new ShuffleServerConf();
@@ -67,15 +80,20 @@ public class HealthCheckTest {
     HealthCheck checker = new HealthCheck(healthy, conf, Lists.newArrayList());
     checker.check();
     assertTrue(healthy.get());
+    assertEquals(0, ShuffleServerMetrics.gaugeIsHealthy.get());
+
     conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(), 
UnHealthyMockChecker.class.getCanonicalName());
     checker = new HealthCheck(healthy, conf, Lists.newArrayList());
     checker.check();
     assertFalse(healthy.get());
+    assertEquals(1, ShuffleServerMetrics.gaugeIsHealthy.get());
+
     conf.setString(ShuffleServerConf.HEALTH_CHECKER_CLASS_NAMES.key(),
         UnHealthyMockChecker.class.getCanonicalName() + "," + 
HealthyMockChecker.class.getCanonicalName());
     checker = new HealthCheck(healthy, conf, Lists.newArrayList());
     checker.check();
     assertFalse(healthy.get());
+    assertEquals(1, ShuffleServerMetrics.gaugeIsHealthy.get());
   }
 
   private void assertConf(ShuffleServerConf conf) {
diff --git 
a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java 
b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
index 404a4fff..cd6b33ec 100644
--- 
a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
+++ 
b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
@@ -90,7 +90,7 @@ public class ShuffleServerMetricsTest {
     JsonNode actualObj = mapper.readTree(content);
     assertEquals(2, actualObj.size());
     JsonNode metricsNode = actualObj.get("metrics");
-    assertEquals(40, metricsNode.size());
+    assertEquals(46, metricsNode.size());
 
     List<String> expectedMetricNames = Lists.newArrayList(
         ShuffleServerMetrics.STORAGE_TOTAL_WRITE_REMOTE_PREFIX + STORAGE_HOST,
diff --git 
a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java 
b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
index 1282d7bd..078f9502 100644
--- a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java
@@ -22,11 +22,14 @@ import java.util.Arrays;
 import java.util.List;
 
 import com.google.common.collect.Lists;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import org.apache.uniffle.storage.common.LocalStorage;
 import org.apache.uniffle.storage.util.StorageType;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -34,6 +37,16 @@ public class StorageCheckerTest {
 
   private int callTimes = 0;
 
+  @BeforeAll
+  public static void setup() {
+    ShuffleServerMetrics.register();
+  }
+
+  @AfterAll
+  public static void clear() {
+    ShuffleServerMetrics.clear();
+  }
+
   @Test
   public void checkTest() throws Exception {
     ShuffleServerConf conf = new ShuffleServerConf();
@@ -48,22 +61,43 @@ public class StorageCheckerTest {
     LocalStorageChecker checker = new MockStorageChecker(conf, storages);
 
     assertTrue(checker.checkIsHealthy());
+    assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+    assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+    assertEquals(0.2, 
ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.get());
+    assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+    assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
 
     callTimes++;
     assertTrue(checker.checkIsHealthy());
+    assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+    assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+    assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+    assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
 
     callTimes++;
     assertFalse(checker.checkIsHealthy());
+    assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+    assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+    assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+    assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
 
     callTimes++;
     assertTrue(checker.checkIsHealthy());
     conf.set(ShuffleServerConf.HEALTH_MIN_STORAGE_PERCENTAGE, 80.0);
     checker = new MockStorageChecker(conf, storages);
     assertFalse(checker.checkIsHealthy());
+    assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+    assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+    assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+    assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
 
     callTimes++;
     checker.checkIsHealthy();
     assertTrue(checker.checkIsHealthy());
+    assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
+    assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
+    assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
+    assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());
   }
 
   private class MockStorageChecker extends LocalStorageChecker {

Reply via email to