This is an automated email from the ASF dual-hosted git repository.
nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new a9490d6e2 [CELEBORN-2118] Introduce IsHighWorkload metric to monitor
worker overload status
a9490d6e2 is described below
commit a9490d6e24683e5f7a6526fff027600676913963
Author: xxx <[email protected]>
AuthorDate: Mon Aug 25 20:46:17 2025 +0800
[CELEBORN-2118] Introduce IsHighWorkload metric to monitor worker overload
status
### What changes were proposed in this pull request?
Introduce `IsHighWorkload` metric to monitor worker overload status.
### Why are the changes needed?
There is currently no metric to monitor worker overload status.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
[Grafana
test](https://xy2953396112.grafana.net/public-dashboards/22ab1750ef874a1bb39b5879b81a24cf).
Closes #3435 from xy2953396112/CELEBORN-2118.
Authored-by: xxx <[email protected]>
Signed-off-by: SteNicholas <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 88 ++++++++++++++++++++++
docs/monitoring.md | 1 +
.../celeborn/service/deploy/worker/Worker.scala | 7 ++
.../service/deploy/worker/WorkerSource.scala | 2 +
4 files changed, 98 insertions(+)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index f5603e9a2..3ad72ada9 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -525,6 +525,94 @@
],
"title": "metrics_RunningApplicationCount_Value",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "Celeborn worker high workload status.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 148
+ },
+ "id": 97,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "expr":
"metrics_IsHighWorkload_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_IsHighWorkload_Value",
+ "type": "timeseries"
}
],
"title": "Overall",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index a455e3dcc..44a52e5b8 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -254,6 +254,7 @@ These metrics are exposed by Celeborn worker.
| UserProduceSpeed | The speed of user production
for congestion control.
|
| WorkerConsumeSpeed | The speed of worker consumption
for congestion control.
|
| IsDecommissioningWorker | 1 means worker decommissioning,
0 means not decommissioning.
|
+ | IsHighWorkload | 1 means worker high workload, 0
means not high workload. |
| UnreleasedShuffleCount | Unreleased shuffle count when
worker is decommissioning.
|
| UnreleasedPartitionLocationCount | Unreleased partition location
count when worker is shutting down.
|
| MemoryStorageFileCount | The count of files in Memory
Storage of a worker.
|
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 93208751d..5107dfd5f 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -440,6 +440,13 @@ private[celeborn] class Worker(
workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { ()
=>
memoryManager.getPausePushDataAndReplicateCounter
}
+ workerSource.addGauge(WorkerSource.IS_HIGH_WORKLOAD) { () =>
+ if (highWorkload) {
+ 1
+ } else {
+ 0
+ }
+ }
workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () =>
workerInfo.usedSlots()
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 1b94406c8..0d6328340 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -242,6 +242,8 @@ object WorkerSource {
val MEMORY_STORAGE_FILE_COUNT = "MemoryStorageFileCount"
+ val IS_HIGH_WORKLOAD = "IsHighWorkload"
+
// credit
val ACTIVE_CREDIT_STREAM_COUNT = "ActiveCreditStreamCount"
val ACTIVE_MAP_PARTITION_COUNT = "ActiveMapPartitionCount"