This is an automated email from the ASF dual-hosted git repository.
zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 7e63bdeb7 [#2350] improvement(coordinator): Add metrics of active/lost
server number (#2351)
7e63bdeb7 is described below
commit 7e63bdeb76f242117dede092db966d7048e5dbb0
Author: Junfan Zhang <[email protected]>
AuthorDate: Sun Jan 26 14:33:43 2025 +0800
[#2350] improvement(coordinator): Add metrics of active/lost server number
(#2351)
### What changes were proposed in this pull request?
Add coordinator gauge metrics for the number of active servers (`active_server_num`) and lost servers (`lost_server_num`).
for #2350
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Not needed; no tests were added.
---------
Co-authored-by: Junfan Zhang <[email protected]>
---
.../org/apache/uniffle/coordinator/SimpleClusterManager.java | 11 +++++++++++
.../apache/uniffle/coordinator/metric/CoordinatorMetrics.java | 8 ++++++--
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git
a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
index d1daf565b..5d9e88db3 100644
---
a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
+++
b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java
@@ -25,6 +25,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -171,6 +172,16 @@ public class SimpleClusterManager implements
ClusterManager {
CoordinatorMetrics.gaugeUnhealthyServerNum.set(unhealthyNodes.size());
CoordinatorMetrics.gaugeTotalServerNum.set(servers.size());
+ CoordinatorMetrics.gaugeLostServerNum.set(lostNodes.size());
+
+ // get the active server num.
+ Set<String> allServers = new HashSet<>(servers.keySet());
+ allServers.removeAll(excludedNodes);
+ for (ServerNode unhealthyNode : unhealthyNodes) {
+ allServers.remove(unhealthyNode.getId());
+ }
+ CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size());
+
} catch (Exception e) {
LOG.warn("Error happened in nodesCheck", e);
}
diff --git
a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
index a97892526..0b61011f6 100644
---
a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
+++
b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java
@@ -33,7 +33,8 @@ import org.apache.uniffle.common.util.JavaUtils;
import org.apache.uniffle.common.util.RssUtils;
public class CoordinatorMetrics {
-
+ private static final String ACTIVE_SERVER_NUM = "active_server_num";
+ private static final String LOST_SERVER_NUM = "lost_server_num";
private static final String TOTAL_SERVER_NUM = "total_server_num";
private static final String RUNNING_APP_NUM = "running_app_num";
private static final String TOTAL_APP_NUM = "total_app_num";
@@ -46,7 +47,8 @@ public class CoordinatorMetrics {
public static final String REMOTE_STORAGE_IN_USED_PREFIX =
"remote_storage_in_used_";
public static final String APP_NUM_TO_USER = "app_num";
public static final String USER_LABEL = "user_name";
-
+ public static Gauge gaugeLostServerNum;
+ public static Gauge gaugeActiveServerNum;
public static Gauge gaugeTotalServerNum;
public static Gauge gaugeExcludeServerNum;
public static Gauge gaugeUnhealthyServerNum;
@@ -107,6 +109,8 @@ public class CoordinatorMetrics {
}
private static void setUpMetrics() {
+ gaugeLostServerNum = metricsManager.addGauge(LOST_SERVER_NUM);
+ gaugeActiveServerNum = metricsManager.addGauge(ACTIVE_SERVER_NUM);
gaugeTotalServerNum = metricsManager.addGauge(TOTAL_SERVER_NUM);
gaugeExcludeServerNum = metricsManager.addGauge(EXCLUDE_SERVER_NUM);
gaugeUnhealthyServerNum = metricsManager.addGauge(UNHEALTHY_SERVER_NUM);