This is an automated email from the ASF dual-hosted git repository.

nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new e5cef1636 [CELEBORN-2209] Introduce ReadBufferUsageRatio metric to 
monitor credit stream read buffer usage
e5cef1636 is described below

commit e5cef163646ff4592d981aad8eb714ccd3833414
Author: SteNicholas <[email protected]>
AuthorDate: Mon Dec 1 12:06:40 2025 +0800

    [CELEBORN-2209] Introduce ReadBufferUsageRatio metric to monitor credit 
stream read buffer usage
    
    ### What changes were proposed in this pull request?
    
    Introduce `ReadBufferUsageRatio` metric to monitor credit stream read 
buffer usage.
    
    ### Why are the changes needed?
    
    The `BufferStreamReadBuffer` metric only reports the memory used by the 
credit stream read buffer, which is not enough to monitor the ratio of credit 
stream read buffer usage to max direct memory.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    CI and a manual Grafana test with the [celeborn 
dashboard](https://stenicholas.grafana.net/public-dashboards/12f47ac2ba2f4c0c88f761f98ffcf51c).
    
    Closes #3545 from SteNicholas/CELEBORN-2209.
    
    Authored-by: SteNicholas <[email protected]>
    Signed-off-by: SteNicholas <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 95 ++++++++++++++++++++++
 docs/monitoring.md                                 |  3 +-
 .../deploy/worker/memory/MemoryManager.java        |  4 +
 .../celeborn/service/deploy/worker/Worker.scala    |  3 +
 .../service/deploy/worker/WorkerSource.scala       |  1 +
 5 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index 0dba8a844..2e62facaa 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -12710,6 +12710,101 @@
           ],
           "title": "AvailableReadBuffer",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "The ratio of read buffer used and max direct 
memory.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 35
+                  }
+                ]
+              },
+              "unit": "percentunit"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 289
+          },
+          "id": 109,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "maxHeight": 600,
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "builder",
+              "expr": 
"metrics_ReadBufferUsageRatio_Value{instance=~\"${instance}\"}",
+              "instant": false,
+              "legendFormat": "${baseLegend}",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "ReadBufferUsageRatio",
+          "type": "timeseries"
         }
       ],
       "title": "MemoryRelatives",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 18bb29c8f..4c7d4adfa 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -244,7 +244,8 @@ These metrics are exposed by Celeborn worker.
     | BufferStreamReadBuffer                 | The memory used by credit 
stream read buffer.                                                             
      |
     | ReadBufferDispatcherRequestsLength     | The queue size of read buffer 
allocation requests.                                                            
  |
     | ReadBufferAllocatedCount               | Allocated read buffer count.    
                                                                                
|
-  - | AvailableReadBuffer                    | The available memory for credit 
stream read buffer.                                                             
|
+    | AvailableReadBuffer                    | The available memory for credit 
stream read buffer.                                                             
|
+    | ReadBufferUsageRatio                   | Ratio of credit stream read 
buffer used and max direct memory.                                              
    |
     | ActiveCreditStreamCount                | Active stream count for map 
partition reading streams.                                                      
    |
     | ActiveMapPartitionCount                | The count of active map 
partition reading streams.                                                      
        |
     | SorterCacheHitRate                     | The cache hit rate for worker 
partition sorter index.                                                         
  |
diff --git 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
index de17c4549..4920a5ce4 100644
--- 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
+++ 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
@@ -506,6 +506,10 @@ public class MemoryManager {
     return Math.max(0, readBufferThreshold - readBufferCounter.get());
   }
 
+  public double readBufferUsageRatio() {
+    return readBufferCounter.get() / (double) readBufferThreshold;
+  }
+
   public long getPausePushDataAndReplicateCounter() {
     return pausePushDataAndReplicateCounter.sum();
   }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 2ba3ba56a..65e0779c9 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -409,6 +409,9 @@ private[celeborn] class Worker(
   workerSource.addGauge(WorkerSource.AVAILABLE_READ_BUFFER) { () =>
     memoryManager.availableReadBuffer
   }
+  workerSource.addGauge(WorkerSource.READ_BUFFER_USAGE_RATIO) { () =>
+    memoryManager.readBufferUsageRatio
+  }
   workerSource.addGauge(WorkerSource.MEMORY_FILE_STORAGE_SIZE) { () =>
     memoryManager.getMemoryFileStorageCounter
   }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index f3f5446cc..7d2467333 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -243,6 +243,7 @@ object WorkerSource {
   val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH = 
"ReadBufferDispatcherRequestsLength"
   val READ_BUFFER_ALLOCATED_COUNT = "ReadBufferAllocatedCount"
   val AVAILABLE_READ_BUFFER = "AvailableReadBuffer"
+  val READ_BUFFER_USAGE_RATIO = "ReadBufferUsageRatio"
   val MEMORY_FILE_STORAGE_SIZE = "MemoryFileStorageSize"
   val DIRECT_MEMORY_USAGE_RATIO = "DirectMemoryUsageRatio"
   val EVICTED_FILE_COUNT = "EvictedFileCount"

Reply via email to