This is an automated email from the ASF dual-hosted git repository.

ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new c788c3802 [CELEBORN-1328] Introduce ActiveSlotsCount metric to monitor 
the number of active slots
c788c3802 is described below

commit c788c380256b011afa5184f7ddc8affe79c9d9e4
Author: CodingCat <[email protected]>
AuthorDate: Mon Apr 8 11:08:05 2024 +0800

    [CELEBORN-1328] Introduce ActiveSlotsCount metric to monitor the number of 
active slots
    
    ### What changes were proposed in this pull request?
    
    Introduce the `ActiveSlotsCount` metric to represent the current disk 
resource demand in the cluster.
    
    ### Why are the changes needed?
    
    An `ActiveSlotsCount` metric is needed to represent the current disk 
resource demand in the cluster, because `SlotsAllocated` keeps increasing and 
does not drop when slots are released.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Verified in our test cluster: the value of activeSlots increases and then 
drops back to 0 after the application finishes, while slotsAllocated keeps 
increasing the whole time.
    
    
![image](https://github.com/apache/incubator-celeborn/assets/678008/c05aa763-11ad-4bbd-9ae0-dd6a9cb01ac5)
    
    Closes #2386 from CodingCat/slots_decrease.
    
    Lead-authored-by: CodingCat <[email protected]>
    Co-authored-by: Nan Zhu <[email protected]>
    Co-authored-by: Fei Wang <[email protected]>
    Signed-off-by: mingji <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 90 ++++++++++++++++++++++
 .../apache/celeborn/common/meta/DeviceInfo.scala   |  2 +-
 docs/monitoring.md                                 |  2 +
 .../celeborn/service/deploy/worker/Worker.scala    |  3 +
 .../service/deploy/worker/WorkerSource.scala       |  1 +
 5 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index cf327a9dd..a8fe453dd 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -1281,6 +1281,96 @@
           "title": "metrics_SlotsAllocated_increase_1h",
           "type": "timeseries"
         },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 290
+          },
+          "id": 48,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": "metrics_ActiveSlotsCount_Value",
+              "legendFormat": "${baseLegend}",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_ActiveSlotsCount_Value",
+          "type": "timeseries"
+        },
         {
           "datasource": {
             "type": "prometheus",
diff --git 
a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala 
b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
index e3e547407..b0438bd1b 100644
--- a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
@@ -141,7 +141,7 @@ class DiskInfo(
       shuffleAllocations.put(shuffleKey, shuffleAllocated - slots)
       applicationAllocations.put(applicationId, applicationAllocated - slots)
     }
-    activeSlots = activeSlots - Math.min(shuffleAllocated, slots)
+    activeSlots -= Math.min(shuffleAllocated, slots)
   }
 
   def releaseSlots(shuffleKey: String): Unit = this.synchronized {
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 11d03be6b..042c6dbb1 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -199,6 +199,8 @@ These metrics are exposed by Celeborn worker.
     - CommitFilesTime
         - The time for a worker to flush buffers and close files related to 
specified shuffle.
     - SlotsAllocated
+    - ActiveSlotsCount
+        - The number of slots currently being used in a worker.
     - ReserveSlotsTime
     - ActiveConnectionCount
     - NettyMemory
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 88ad05c4e..0da4b5513 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -396,6 +396,9 @@ private[celeborn] class Worker(
   workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { () 
=>
     memoryManager.getPausePushDataAndReplicateCounter
   }
+  workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () =>
+    workerInfo.usedSlots()
+  }
 
   private def highWorkload: Boolean = {
     (memoryManager.currentServingState, conf.workerActiveConnectionMax) match {
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 034be2d55..6be2a9256 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -170,6 +170,7 @@ object WorkerSource {
 
   // slots
   val SLOTS_ALLOCATED = "SlotsAllocated"
+  val ACTIVE_SLOTS_COUNT = "ActiveSlotsCount"
   val RESERVE_SLOTS_TIME = "ReserveSlotsTime"
 
   // connection

Reply via email to