jojochuang commented on code in PR #10609:
URL: https://github.com/apache/ozone/pull/10609#discussion_r3483152171


##########
hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk Balancer.json:
##########
@@ -0,0 +1,358 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": true,
+        "enable": true,
+        "hide": true,
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "id": 2,
+      "title": "Disk Space Convergence Profile (Historical Before vs After)",
+      "type": "timeseries",
+      "description": "Shows how full each disk is over time. When the balancer 
is running, you should see these lines slowly move closer together until they 
meet in the middle.",
+      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"{__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}",
+          "legendFormat": "{{storagedirectory}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes",
+          "color": { "mode": "palette-classic" },
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "lineWidth": 2
+          }
+        }
+      }
+    },
+    {
+      "id": 3,
+      "title": "Current Disk Space Allocation Skew (Max - Min Volume Gap)",
+      "type": "bargauge",
+      "description": "The exact size of the data gap between your most full 
disk and your emptiest disk right now. The balancer's goal is to shrink this 
number down to zero.",
+      "gridPos": { "x": 12, "y": 0, "w": 12, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance) - 
min({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes",
+          "max": 5368709120
+        }
+      },
+      "options": {
+        "displayMode": "lcd",
+        "orientation": "horizontal",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 5,
+      "title": "Active Balancing Throughput Speed",
+      "type": "stat",
+      "description": "The live speedometer. This shows exactly how fast data 
is physically copying from one disk to another across the node.",
+      "gridPos": { "x": 0, "y": 7, "w": 6, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 7,
+      "title": "Unscheduled Target Backlog",
+      "type": "stat",
+      "description": "The remaining amount of data the balancer still needs to 
queue up to reach perfect balance.",
+      "gridPos": { "x": 6, "y": 7, "w": 8, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance) - 
avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 8,
+      "title": "Est. Time for Unscheduled Backlog",
+      "type": "stat",
+      "description": "A rough estimate of how many minutes are left before the 
balancer finishes balancing.",
+      "gridPos": { "x": 14, "y": 7, "w": 10, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"((sum(max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance) - 
avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}) by (instance)) / 
(sum(rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m]))
 > 0)) / 60) or vector(0)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "m"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 6,
+      "title": "Container Migration Latency Profile (Processing Duration)",
+      "type": "timeseries",
+      "description": "How much time it takes the system to successfully copy, 
verify, and finalize a single container.",
+      "gridPos": { "x": 0, "y": 14, "w": 24, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"{__name__=~\"disk_balancer_service_metrics_copy_success_latency|disk_balancer_service_metrics_copy_failure_latency|disk_balancer_service_metrics_move_success_time_avg_time\",
 instance=~\"$datanode.*\"}",
+          "legendFormat": "{{__name__}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "palette-classic" }
+        }
+      }
+    },
+    {
+      "id": 11,
+      "title": "Overall Move Success Rate",
+      "type": "gauge",
+      "description": "The percentage of container moves that finished 
perfectly.",
+      "gridPos": { "x": 0, "y": 21, "w": 5, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "(sum(disk_balancer_service_metrics_success_count) / 
(sum(disk_balancer_service_metrics_success_count) + 
sum(disk_balancer_service_metrics_failure_count) + 0.00001)) * 100",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 80, "color": "red" }
+            ]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 12,
+      "title": "Total Lifetime Container Moves",
+      "type": "stat",
+      "description": "The total number of individual data blocks (containers) 
successfully relocated by the balancer.",
+      "gridPos": { "x": 5, "y": 21, "w": 4, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"max_over_time(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "unit": "short"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 14,
+      "title": "Total Lifetime Data Balanced",
+      "type": "stat",
+      "description": "The total physical weight of all the data successfully 
shifted across the disks since the balancer started.",
+      "gridPos": { "x": 9, "y": 21, "w": 5, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"max_over_time(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "unit": "decbytes"
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "auto",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 15,
+      "title": "Failed Container Moves",
+      "type": "stat",
+      "description": "The number of times a move was aborted due to error. 
This should ideally stay at 0.",
+      "gridPos": { "x": 14, "y": 21, "w": 4, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"max_over_time(disk_balancer_service_metrics_failure_count{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 1, "color": "red" }
+            ]
+          }
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "auto",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 13,
+      "title": "Balancer Idle Reason",
+      "type": "piechart",
+      "description": "Shows why the balancer is taking a break. 'Target 
Reached' means the disks are already balanced. 'Throttled' means it's slowing 
down on purpose so it doesn't overload the server.",
+      "gridPos": { "x": 18, "y": 21, "w": 6, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"sum(increase(disk_balancer_service_metrics_idle_loop_no_available_volume_pair_count[15m]))",
+          "instant": true,
+          "legendFormat": "Target Reached",
+          "refId": "A"
+        },
+        {
+          "expr": 
"sum(increase(disk_balancer_service_metrics_idle_loop_exceeds_bandwidth_count[15m]))",
+          "instant": true,
+          "legendFormat": "Throttled",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "pieType": "donut",
+        "legend": { "displayMode": "list", "placement": "bottom", 
"showLegend": true },
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["Ozone", "DiskBalancer"],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "hide": 0,
+        "includeAll": false,
+        "label": "Data Source",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "current": {},
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "definition": 
"label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "datanode",
+        "options": [],
+        "query": {
+          "query": 
"label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timezone": "browser",
+  "title": "Ozone Disk Balancer Operations Matrix",

Review Comment:
   Suggest shortening the dashboard title to "Ozone Disk Balancer Operations" (dropping the trailing "Matrix").



##########
hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk Balancer.json:
##########
@@ -0,0 +1,358 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": true,
+        "enable": true,
+        "hide": true,
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "id": 2,
+      "title": "Disk Space Convergence Profile (Historical Before vs After)",
+      "type": "timeseries",
+      "description": "Shows how full each disk is over time. When the balancer 
is running, you should see these lines slowly move closer together until they 
meet in the middle.",
+      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": 
"{__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", 
instance=~\"$datanode.*\"}",

Review Comment:
   ```suggestion
             "expr": "{__name__=~\"volume_info_metrics_.*_ozone_used\"}",
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to