This is an automated email from the ASF dual-hosted git repository.

nicholasjiang pushed a commit to branch branch-0.6
in repository https://gitbox.apache.org/repos/asf/celeborn.git

commit e743cae1c00022fa9acd61ddbc323c280d9a57c0
Author: dz <[email protected]>
AuthorDate: Wed Aug 20 10:47:38 2025 +0800

    [CELEBORN-2102] Introduce SorterCacheHitRate metric to monitor the hit 
rate of index cache for sorter
    
    ### What changes were proposed in this pull request?
    
    Introduce `SorterCacheHitRate` metric to monitor the hit rate of index 
cache for sorter.
    
    ### Why are the changes needed?
    
    Monitor the hit rate of `PartitionFilesSorter#indexCache`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Verified with the Grafana dashboard: 
https://xy2953396112.grafana.net/public-dashboards/5d1177ee0f784b53ad817fde919141b7
    
    Closes #3416 from xy2953396112/CELEBORN_2102.
    
    Authored-by: dz <[email protected]>
    Signed-off-by: SteNicholas <[email protected]>
    (cherry picked from commit 11b41f97ad19d77fe05927c75f728a549e8fd382)
    Signed-off-by: SteNicholas <[email protected]>
---
 assets/grafana/celeborn-dashboard.json             | 93 ++++++++++++++++++++++
 docs/monitoring.md                                 |  1 +
 .../worker/storage/PartitionFilesSorter.java       |  2 +
 .../service/deploy/worker/WorkerSource.scala       |  1 +
 4 files changed, 97 insertions(+)

diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index 85e4af319..5480e58ec 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -12238,6 +12238,99 @@
       "title": "metrics_SortedFileSize_Value",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              }
+            ]
+          },
+          "unit": "decbytes"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 89
+      },
+      "id": 156,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.1.0-91295",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": 
"metrics_SorterCacheHitRate_Value{instance=~\"${instance}\"}",
+          "legendFormat": "${baseLegend}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "metrics_SorterCacheHitRate_Value",
+      "type": "timeseries"
+    },
     {
       "datasource": {
         "type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 89558cb42..7a3d5ef5f 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -239,6 +239,7 @@ These metrics are exposed by Celeborn worker.
     | ReadBufferAllocatedCount               | Allocated read buffer count.    
                                                                                
|
     | ActiveCreditStreamCount                | Active stream count for map 
partition reading streams.                                                      
    |
     | ActiveMapPartitionCount                | The count of active map 
partition reading streams.                                                      
        |
+    | SorterCacheHitRate                     | The cache hit rate for worker 
partition sorter index.                                                         
  |
     | CleanTaskQueueSize                     | The count of task for cleaning 
up expired shuffle keys.                                                        
 |
     | CleanExpiredShuffleKeysTime            | The time for a worker to clean 
up shuffle data of expired shuffle keys.                                        
 |
     | DeviceOSFreeBytes                      | The actual usable space of OS 
for device monitor.                                                             
  |
diff --git 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
index a45fa2e23..a2c833ee8 100644
--- 
a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
+++ 
b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java
@@ -145,7 +145,9 @@ public class PartitionFilesSorter extends 
ShuffleRecoverHelper {
                 (key, cache) ->
                     ((Map<Integer, List<ShuffleBlockInfo>>) cache)
                         .values().stream().mapToInt(List::size).sum())
+            .recordStats()
             .build();
+    source.addGauge(WorkerSource.SORTER_CACHE_HIT_RATE(), () -> 
indexCache.stats().hitRate());
 
     fileSorterSchedulerThread =
         
ThreadUtils.newDaemonSingleThreadExecutor("worker-file-sorter-scheduler");
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index f4990d650..4c9715c35 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -229,6 +229,7 @@ object WorkerSource {
   val PENDING_SORT_TASKS = "PendingSortTasks"
   val SORTED_FILES = "SortedFiles"
   val SORTED_FILE_SIZE = "SortedFileSize"
+  val SORTER_CACHE_HIT_RATE = "SorterCacheHitRate"
   val DISK_BUFFER = "DiskBuffer"
   val BUFFER_STREAM_READ_BUFFER = "BufferStreamReadBuffer"
   val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH = 
"ReadBufferDispatcherRequestsLength"

Reply via email to