This is an automated email from the ASF dual-hosted git repository. nicholasjiang pushed a commit to branch branch-0.6 in repository https://gitbox.apache.org/repos/asf/celeborn.git
commit e743cae1c00022fa9acd61ddbc323c280d9a57c0 Author: dz <[email protected]> AuthorDate: Wed Aug 20 10:47:38 2025 +0800 [CELEBORN-2102] Introduce SorterCacheHitRate metric to monitor the hit rate of index cache for sorter ### What changes were proposed in this pull request? Introduce `SorterCacheHitRate` metric to monitor the hit rate of index cache for sorter. ### Why are the changes needed? Monitor the hit rate of `PartitionFilesSorter#indexCache`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? The verified grafana dashboard: https://xy2953396112.grafana.net/public-dashboards/5d1177ee0f784b53ad817fde919141b7 Closes #3416 from xy2953396112/CELEBORN_2102. Authored-by: dz <[email protected]> Signed-off-by: SteNicholas <[email protected]> (cherry picked from commit 11b41f97ad19d77fe05927c75f728a549e8fd382) Signed-off-by: SteNicholas <[email protected]> --- assets/grafana/celeborn-dashboard.json | 93 ++++++++++++++++++++++ docs/monitoring.md | 1 + .../worker/storage/PartitionFilesSorter.java | 2 + .../service/deploy/worker/WorkerSource.scala | 1 + 4 files changed, 97 insertions(+) diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json index 85e4af319..5480e58ec 100644 --- a/assets/grafana/celeborn-dashboard.json +++ b/assets/grafana/celeborn-dashboard.json @@ -12238,6 +12238,99 @@ "title": "metrics_SortedFileSize_Value", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 89 + }, + "id": 156, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0-91295", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "metrics_SorterCacheHitRate_Value{instance=~\"${instance}\"}", + "legendFormat": "${baseLegend}", + "range": true, + "refId": "A" + } + ], + "title": "metrics_SorterCacheHitRate_Value", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", diff --git a/docs/monitoring.md b/docs/monitoring.md index 89558cb42..7a3d5ef5f 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -239,6 +239,7 @@ These metrics are exposed by Celeborn worker. | ReadBufferAllocatedCount | Allocated read buffer count. | | ActiveCreditStreamCount | Active stream count for map partition reading streams. | | ActiveMapPartitionCount | The count of active map partition reading streams. | + | SorterCacheHitRate | The cache hit rate for worker partition sorter index. | | CleanTaskQueueSize | The count of task for cleaning up expired shuffle keys. | | CleanExpiredShuffleKeysTime | The time for a worker to clean up shuffle data of expired shuffle keys. | | DeviceOSFreeBytes | The actual usable space of OS for device monitor. 
| diff --git a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java index a45fa2e23..a2c833ee8 100644 --- a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java +++ b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionFilesSorter.java @@ -145,7 +145,9 @@ public class PartitionFilesSorter extends ShuffleRecoverHelper { (key, cache) -> ((Map<Integer, List<ShuffleBlockInfo>>) cache) .values().stream().mapToInt(List::size).sum()) + .recordStats() .build(); + source.addGauge(WorkerSource.SORTER_CACHE_HIT_RATE(), () -> indexCache.stats().hitRate()); fileSorterSchedulerThread = ThreadUtils.newDaemonSingleThreadExecutor("worker-file-sorter-scheduler"); diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index f4990d650..4c9715c35 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -229,6 +229,7 @@ object WorkerSource { val PENDING_SORT_TASKS = "PendingSortTasks" val SORTED_FILES = "SortedFiles" val SORTED_FILE_SIZE = "SortedFileSize" + val SORTER_CACHE_HIT_RATE = "SorterCacheHitRate" val DISK_BUFFER = "DiskBuffer" val BUFFER_STREAM_READ_BUFFER = "BufferStreamReadBuffer" val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH = "ReadBufferDispatcherRequestsLength"
