This is an automated email from the ASF dual-hosted git repository.
nicholasjiang pushed a commit to branch branch-0.6
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/branch-0.6 by this push:
new 834fb1a50 [CELEBORN-2102] Introduce SorterCacheHitRate metric to
monitor the hit rate of index cache for sorter
834fb1a50 is described below
commit 834fb1a506ed514298f6a74f75553bf6095a1bbe
Author: dz <[email protected]>
AuthorDate: Wed Aug 20 10:47:38 2025 +0800
[CELEBORN-2102] Introduce SorterCacheHitRate metric to monitor the hit
rate of index cache for sorter
### What changes were proposed in this pull request?
Introduce `SorterCacheHitRate` metric to monitor the hit rate of index
cache for sorter.
### Why are the changes needed?
Monitor the hit rate of `PartitionFilesSorter#indexCache`.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Verified with the following Grafana dashboard:
https://xy2953396112.grafana.net/public-dashboards/5d1177ee0f784b53ad817fde919141b7
Closes #3416 from xy2953396112/CELEBORN_2102.
Authored-by: dz <[email protected]>
Signed-off-by: SteNicholas <[email protected]>
(cherry picked from commit 11b41f97ad19d77fe05927c75f728a549e8fd382)
Signed-off-by: SteNicholas <[email protected]>
---
assets/grafana/celeborn-dashboard.json | 93 ++++++++++++++++++++++
docs/monitoring.md | 1 +
.../worker/storage/PartitionFilesSorter.java | 2 +
.../service/deploy/worker/WorkerSource.scala | 1 +
4 files changed, 97 insertions(+)
diff --git a/assets/grafana/celeborn-dashboard.json
b/assets/grafana/celeborn-dashboard.json
index 85e4af319..5480e58ec 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -12238,6 +12238,99 @@
"title": "metrics_SortedFileSize_Value",
"type": "timeseries"
},
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 89
+ },
+ "id": 156,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "12.1.0-91295",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr":
"metrics_SorterCacheHitRate_Value{instance=~\"${instance}\"}",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_SorterCacheHitRate_Value",
+ "type": "timeseries"
+ },
{
"datasource": {
"type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 89558cb42..7a3d5ef5f 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -239,6 +239,7 @@ These metrics are exposed by Celeborn worker.
| ReadBufferAllocatedCount | Allocated read buffer count.
|
| ActiveCreditStreamCount | Active stream count for map
partition reading streams.
|
| ActiveMapPartitionCount | The count of active map
partition reading streams.
|
+ | SorterCacheHitRate | The cache hit rate for worker
partition sorter index.
|
| CleanTaskQueueSize | The count of task for cleaning
up expired shuffle keys.
|
| CleanExpiredShuffleKeysTime | The time for a worker to clean
up shuffle data of expired shuffle keys.
|
| DeviceOSFreeBytes | The actual usable space of OS
for device monitor.
|
diff --git
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
index a45fa2e23..a2c833ee8 100644
---
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
+++
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
@@ -145,7 +145,9 @@ public class PartitionFilesSorter extends
ShuffleRecoverHelper {
(key, cache) ->
((Map<Integer, List<ShuffleBlockInfo>>) cache)
.values().stream().mapToInt(List::size).sum())
+ .recordStats()
.build();
+ source.addGauge(WorkerSource.SORTER_CACHE_HIT_RATE(), () ->
indexCache.stats().hitRate());
fileSorterSchedulerThread =
ThreadUtils.newDaemonSingleThreadExecutor("worker-file-sorter-scheduler");
diff --git
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index f4990d650..4c9715c35 100644
---
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -229,6 +229,7 @@ object WorkerSource {
val PENDING_SORT_TASKS = "PendingSortTasks"
val SORTED_FILES = "SortedFiles"
val SORTED_FILE_SIZE = "SortedFileSize"
+ val SORTER_CACHE_HIT_RATE = "SorterCacheHitRate"
val DISK_BUFFER = "DiskBuffer"
val BUFFER_STREAM_READ_BUFFER = "BufferStreamReadBuffer"
val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH =
"ReadBufferDispatcherRequestsLength"