This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 7e63bdeb7 [#2350] improvement(coordinator): Add metrics of active/lost server number (#2351)
7e63bdeb7 is described below

commit 7e63bdeb76f242117dede092db966d7048e5dbb0
Author: Junfan Zhang <[email protected]>
AuthorDate: Sun Jan 26 14:33:43 2025 +0800

    [#2350] improvement(coordinator): Add metrics of active/lost server number (#2351)
    
    ### What changes were proposed in this pull request?
    
    Add coordinator metrics for the number of active and lost servers.
    
    for #2350
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Not needed.
    
    ---------
    
    Co-authored-by: Junfan Zhang <[email protected]>
---
 .../org/apache/uniffle/coordinator/SimpleClusterManager.java  | 11 +++++++++++
 .../apache/uniffle/coordinator/metric/CoordinatorMetrics.java |  8 ++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
index d1daf565b..5d9e88db3 100644
--- a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
+++ b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
 import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -171,6 +172,16 @@ public class SimpleClusterManager implements ClusterManager {
 
       CoordinatorMetrics.gaugeUnhealthyServerNum.set(unhealthyNodes.size());
       CoordinatorMetrics.gaugeTotalServerNum.set(servers.size());
+      CoordinatorMetrics.gaugeLostServerNum.set(lostNodes.size());
+
+      // get the active server num.
+      Set<String> allServers = new HashSet<>(servers.keySet());
+      allServers.removeAll(excludedNodes);
+      for (ServerNode unhealthyNode : unhealthyNodes) {
+        allServers.remove(unhealthyNode.getId());
+      }
+      CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size());
+
     } catch (Exception e) {
       LOG.warn("Error happened in nodesCheck", e);
     }
diff --git a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
index a97892526..0b61011f6 100644
--- a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
+++ b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
@@ -33,7 +33,8 @@ import org.apache.uniffle.common.util.JavaUtils;
 import org.apache.uniffle.common.util.RssUtils;
 
 public class CoordinatorMetrics {
-
+  private static final String ACTIVE_SERVER_NUM = "active_server_num";
+  private static final String LOST_SERVER_NUM = "lost_server_num";
   private static final String TOTAL_SERVER_NUM = "total_server_num";
   private static final String RUNNING_APP_NUM = "running_app_num";
   private static final String TOTAL_APP_NUM = "total_app_num";
@@ -46,7 +47,8 @@ public class CoordinatorMetrics {
   public static final String REMOTE_STORAGE_IN_USED_PREFIX = 
"remote_storage_in_used_";
   public static final String APP_NUM_TO_USER = "app_num";
   public static final String USER_LABEL = "user_name";
-
+  public static Gauge gaugeLostServerNum;
+  public static Gauge gaugeActiveServerNum;
   public static Gauge gaugeTotalServerNum;
   public static Gauge gaugeExcludeServerNum;
   public static Gauge gaugeUnhealthyServerNum;
@@ -107,6 +109,8 @@ public class CoordinatorMetrics {
   }
 
   private static void setUpMetrics() {
+    gaugeLostServerNum = metricsManager.addGauge(LOST_SERVER_NUM);
+    gaugeActiveServerNum = metricsManager.addGauge(ACTIVE_SERVER_NUM);
     gaugeTotalServerNum = metricsManager.addGauge(TOTAL_SERVER_NUM);
     gaugeExcludeServerNum = metricsManager.addGauge(EXCLUDE_SERVER_NUM);
     gaugeUnhealthyServerNum = metricsManager.addGauge(UNHEALTHY_SERVER_NUM);

Reply via email to