This is an automated email from the ASF dual-hosted git repository.
nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new e5cef1636 [CELEBORN-2209] Introduce ReadBufferUsageRatio metric to
monitor credit stream read buffer usage
e5cef1636 is described below
commit e5cef163646ff4592d981aad8eb714ccd3833414
Author: SteNicholas <[email protected]>
AuthorDate: Mon Dec 1 12:06:40 2025 +0800
[CELEBORN-2209] Introduce ReadBufferUsageRatio metric to monitor credit
stream read buffer usage
### What changes were proposed in this pull request?
Introduce `ReadBufferUsageRatio` metric to monitor credit stream read
buffer usage.
### Why are the changes needed?
The `BufferStreamReadBuffer` metric only reports the memory used by the
credit stream read buffer, which is not enough to monitor the ratio of
credit stream read buffer usage to max direct memory.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI and a manual Grafana test with the [Celeborn
dashboard](https://stenicholas.grafana.net/public-dashboards/12f47ac2ba2f4c0c88f761f98ffcf51c).
Closes #3545 from SteNicholas/CELEBORN-2209.
Authored-by: SteNicholas <[email protected]>
Signed-off-by: SteNicholas <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 95 ++++++++++++++++++++++
docs/monitoring.md | 3 +-
.../deploy/worker/memory/MemoryManager.java | 4 +
.../celeborn/service/deploy/worker/Worker.scala | 3 +
.../service/deploy/worker/WorkerSource.scala | 1 +
5 files changed, 105 insertions(+), 1 deletion(-)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 0dba8a844..2e62facaa 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -12710,6 +12710,101 @@
],
"title": "AvailableReadBuffer",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "The ratio of read buffer used and max direct
memory.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 35
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 289
+ },
+ "id": 109,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "builder",
+ "expr":
"metrics_ReadBufferUsageRatio_Value{instance=~\"${instance}\"}",
+ "instant": false,
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "ReadBufferUsageRatio",
+ "type": "timeseries"
}
],
"title": "MemoryRelatives",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 18bb29c8f..4c7d4adfa 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -244,7 +244,8 @@ These metrics are exposed by Celeborn worker.
| BufferStreamReadBuffer | The memory used by credit
stream read buffer.
|
| ReadBufferDispatcherRequestsLength | The queue size of read buffer
allocation requests.
|
| ReadBufferAllocatedCount | Allocated read buffer count.
|
- - | AvailableReadBuffer | The available memory for credit
stream read buffer.
|
+ | AvailableReadBuffer | The available memory for credit
stream read buffer.
|
+ | ReadBufferUsageRatio | Ratio of credit stream read
buffer used and max direct memory.
|
| ActiveCreditStreamCount | Active stream count for map
partition reading streams.
|
| ActiveMapPartitionCount | The count of active map
partition reading streams.
|
| SorterCacheHitRate | The cache hit rate for worker
partition sorter index.
|
diff --git
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
index de17c4549..4920a5ce4 100644
---
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
+++
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java
@@ -506,6 +506,10 @@ public class MemoryManager {
return Math.max(0, readBufferThreshold - readBufferCounter.get());
}
+ public double readBufferUsageRatio() {
+ return readBufferCounter.get() / (double) readBufferThreshold;
+ }
+
public long getPausePushDataAndReplicateCounter() {
return pausePushDataAndReplicateCounter.sum();
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 2ba3ba56a..65e0779c9 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -409,6 +409,9 @@ private[celeborn] class Worker(
workerSource.addGauge(WorkerSource.AVAILABLE_READ_BUFFER) { () =>
memoryManager.availableReadBuffer
}
+ workerSource.addGauge(WorkerSource.READ_BUFFER_USAGE_RATIO) { () =>
+ memoryManager.readBufferUsageRatio
+ }
workerSource.addGauge(WorkerSource.MEMORY_FILE_STORAGE_SIZE) { () =>
memoryManager.getMemoryFileStorageCounter
}
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index f3f5446cc..7d2467333 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -243,6 +243,7 @@ object WorkerSource {
val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH =
"ReadBufferDispatcherRequestsLength"
val READ_BUFFER_ALLOCATED_COUNT = "ReadBufferAllocatedCount"
val AVAILABLE_READ_BUFFER = "AvailableReadBuffer"
+ val READ_BUFFER_USAGE_RATIO = "ReadBufferUsageRatio"
val MEMORY_FILE_STORAGE_SIZE = "MemoryFileStorageSize"
val DIRECT_MEMORY_USAGE_RATIO = "DirectMemoryUsageRatio"
val EVICTED_FILE_COUNT = "EvictedFileCount"