navinko commented on code in PR #10609:
URL: https://github.com/apache/ozone/pull/10609#discussion_r3487434639
##########
hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk
Balancer.json:
##########
@@ -0,0 +1,358 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": true,
+ "enable": true,
+ "hide": true,
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 1,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "id": 2,
+ "title": "Disk Space Convergence Profile (Historical Before vs After)",
+ "type": "timeseries",
+ "description": "Shows how full each disk is over time. When the balancer
is running, you should see these lines slowly move closer together until they
meet in the middle.",
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"{__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}",
+ "legendFormat": "{{storagedirectory}}",
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "decbytes",
+ "color": { "mode": "palette-classic" },
+ "custom": {
+ "drawStyle": "line",
+ "lineInterpolation": "smooth",
+ "lineWidth": 2
+ }
+ }
+ }
+ },
+ {
+ "id": 3,
+ "title": "Current Disk Space Allocation Skew (Max - Min Volume Gap)",
+ "type": "bargauge",
+ "description": "The exact size of the data gap between your most full
disk and your emptiest disk right now. The balancer's goal is to shrink this
number down to zero.",
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance) -
min({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance)",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "decbytes",
+ "max": 5368709120
+ }
+ },
+ "options": {
+ "displayMode": "lcd",
+ "orientation": "horizontal",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 5,
+ "title": "Active Balancing Throughput Speed",
+ "type": "stat",
+ "description": "The live speedometer. This shows how fast data is being
copied from one disk to another on the node, averaged over the last minute.",
+ "gridPos": { "x": 0, "y": 7, "w": 6, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])",
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "Bps"
+ }
+ },
+ "options": {
+ "graphMode": "area",
+ "textMode": "value",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 7,
+ "title": "Unscheduled Target Backlog",
+ "type": "stat",
+ "description": "The remaining amount of data the balancer still needs to
queue up to reach perfect balance.",
+ "gridPos": { "x": 6, "y": 7, "w": 8, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance) -
avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance)",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "decbytes"
+ }
+ },
+ "options": {
+ "graphMode": "none",
+ "textMode": "value",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 8,
+ "title": "Est. Time for Unscheduled Backlog",
+ "type": "stat",
+ "description": "A rough estimate of how many minutes are left before the
balancer finishes balancing.",
+ "gridPos": { "x": 14, "y": 7, "w": 10, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"((sum(max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance) -
avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\",
instance=~\"$datanode.*\"}) by (instance)) /
(sum(rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m]))
> 0)) / 60) or vector(0)",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "m"
+ }
+ },
+ "options": {
+ "graphMode": "none",
+ "textMode": "value",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 6,
+ "title": "Container Migration Latency Profile (Processing Duration)",
+ "type": "timeseries",
+ "description": "How much time it takes the system to successfully copy,
verify, and finalize a single container.",
+ "gridPos": { "x": 0, "y": 14, "w": 24, "h": 7 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"{__name__=~\"disk_balancer_service_metrics_copy_success_latency|disk_balancer_service_metrics_copy_failure_latency|disk_balancer_service_metrics_move_success_time_avg_time\",
instance=~\"$datanode.*\"}",
+ "legendFormat": "{{__name__}}",
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ms",
+ "color": { "mode": "palette-classic" }
+ }
+ }
+ },
+ {
+ "id": 11,
+ "title": "Overall Move Success Rate",
+ "type": "gauge",
+ "description": "The percentage of container moves that completed
successfully.",
+ "gridPos": { "x": 0, "y": 21, "w": 5, "h": 6 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr": "(sum(disk_balancer_service_metrics_success_count) /
(sum(disk_balancer_service_metrics_success_count) +
sum(disk_balancer_service_metrics_failure_count) + 0.00001)) * 100",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "min": 0,
+ "max": 100,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": 0, "color": "red" },
+ { "value": 80, "color": "green" }
+ ]
+ }
+ }
+ },
+ "options": {
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 12,
+ "title": "Total Lifetime Container Moves",
+ "type": "stat",
+ "description": "The total number of containers successfully relocated
by the balancer.",
+ "gridPos": { "x": 5, "y": 21, "w": 4, "h": 6 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"max_over_time(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}[1h])",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "short"
+ }
+ },
+ "options": {
+ "graphMode": "none",
+ "textMode": "value",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 14,
+ "title": "Total Lifetime Data Balanced",
+ "type": "stat",
+ "description": "The total amount of data successfully moved between
disks since the balancer started.",
+ "gridPos": { "x": 9, "y": 21, "w": 5, "h": 6 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"max_over_time(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1h])",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "unit": "decbytes"
+ }
+ },
+ "options": {
+ "graphMode": "area",
+ "textMode": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 15,
+ "title": "Failed Container Moves",
+ "type": "stat",
+ "description": "The number of times a move was aborted due to an error.
This should ideally stay at 0.",
+ "gridPos": { "x": 14, "y": 21, "w": 4, "h": 6 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"max_over_time(disk_balancer_service_metrics_failure_count{instance=~\"$datanode.*\"}[1h])",
+ "instant": true,
+ "refId": "A"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "noValue": "0",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ { "value": 0, "color": "green" },
+ { "value": 1, "color": "red" }
+ ]
+ }
+ }
+ },
+ "options": {
+ "graphMode": "area",
+ "textMode": "auto",
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ },
+ {
+ "id": 13,
+ "title": "Balancer Idle Reason",
+ "type": "piechart",
+ "description": "Shows why the balancer is taking a break. 'Target
Reached' means the disks are already balanced. 'Throttled' means it's slowing
down on purpose so it doesn't overload the server.",
+ "gridPos": { "x": 18, "y": 21, "w": 6, "h": 6 },
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "targets": [
+ {
+ "expr":
"sum(increase(disk_balancer_service_metrics_idle_loop_no_available_volume_pair_count[15m]))",
+ "instant": true,
+ "legendFormat": "Target Reached",
+ "refId": "A"
+ },
+ {
+ "expr":
"sum(increase(disk_balancer_service_metrics_idle_loop_exceeds_bandwidth_count[15m]))",
+ "instant": true,
+ "legendFormat": "Throttled",
+ "refId": "B"
+ }
+ ],
+ "options": {
+ "pieType": "donut",
+ "legend": { "displayMode": "list", "placement": "bottom",
"showLegend": true },
+ "reduceOptions": { "calcs": ["lastNotNull"] }
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 38,
+ "style": "dark",
+ "tags": ["Ozone", "DiskBalancer"],
+ "templating": {
+ "list": [
+ {
+ "current": {},
+ "hide": 0,
+ "includeAll": false,
+ "label": "Data Source",
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "current": {},
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
+ "definition":
"label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+ "hide": 0,
+ "includeAll": true,
+ "multi": true,
+ "name": "datanode",
+ "options": [],
+ "query": {
+ "query":
"label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+ "refId": "StandardVariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "type": "query"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timezone": "browser",
+ "title": "Ozone Disk Balancer Operations Matrix",
Review Comment:
updated
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]