This is an automated email from the ASF dual-hosted git repository. tingchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 9eaa3a1364 Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336) 9eaa3a1364 is described below commit 9eaa3a1364b0ad8882b83b8d692c161c19ad31a0 Author: lnbest0707-uber <106711887+lnbest0707-u...@users.noreply.github.com> AuthorDate: Fri Feb 16 11:13:25 2024 -0800 Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336) * Add metrics for no-HA segments monitoring Summary: Add metrics to monitor any segments running with only one replica. This could help us monitor the reliability risk during node replacement. * Fix UT * Track nIdeal - 1 replicas instead of 1 replica * Improve log message * Improve variable naming --- .../apache/pinot/common/metrics/ControllerGauge.java | 3 +++ .../pinot/controller/helix/SegmentStatusChecker.java | 19 ++++++++++++++++--- .../controller/helix/SegmentStatusCheckerTest.java | 16 ++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java index ca8c141447..82e86c55a9 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java @@ -40,6 +40,9 @@ public enum ControllerGauge implements AbstractMetrics.Gauge { // ideal state PERCENT_SEGMENTS_AVAILABLE("segments", false), + // Number of segments running with less than expected replicas in external view + SEGMENTS_WITH_LESS_REPLICAS("segments", false), + SEGMENT_COUNT("SegmentCount", false), // Number of segments including the replaced segments which are specified in the segment lineage entries and cannot diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java index f4121506a1..5b543e4319 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java @@ -242,6 +242,7 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh int nReplicasExternal = -1; // Keeps track of minimum number of replicas in external view int nErrors = 0; // Keeps track of number of segments in error state int nOffline = 0; // Keeps track of number segments with no online replicas + int nNumOfReplicasLessThanIdeal = 0; // Keeps track of number of segments running with less than expected replicas int nSegments = 0; // Counts number of segments long tableCompressedSize = 0; // Tracks the total compressed segment size in deep store per table for (String partitionName : segmentsExcludeReplaced) { @@ -303,6 +304,10 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh LOGGER.warn("Segment {} of table {} has no online replicas", partitionName, tableNameWithType); } nOffline++; + } else if (nReplicas < nReplicasIdealMax) { + LOGGER.debug("Segment {} of table {} is running with {} replicas which is less than the expected values {}", + partitionName, tableNameWithType, nReplicas, nReplicasIdealMax); + nNumOfReplicasLessThanIdeal++; } nReplicasExternal = ((nReplicasExternal > nReplicas) || (nReplicasExternal == -1)) ? nReplicas : nReplicasExternal; @@ -315,6 +320,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS, (nReplicasIdealMax > 0) ? (nReplicasExternal * 100 / nReplicasIdealMax) : 100); _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE, nErrors); + _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS, + nNumOfReplicasLessThanIdeal); _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, (nSegments > 0) ? (nSegments - nOffline) * 100 / nSegments : 100); _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.TABLE_COMPRESSED_SIZE, @@ -323,9 +330,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh if (nOffline > 0) { LOGGER.warn("Table {} has {} segments with no online replicas", tableNameWithType, nOffline); } + if (nNumOfReplicasLessThanIdeal > 0) { + LOGGER.warn("Table {} has {} segments with number of replicas less than the replication factor", + tableNameWithType, nNumOfReplicasLessThanIdeal); + } if (nReplicasExternal < nReplicasIdealMax) { - LOGGER.warn("Table {} has {} replicas, below replication threshold :{}", tableNameWithType, nReplicasExternal, - nReplicasIdealMax); + LOGGER.warn("Table {} has at least one segment running with only {} replicas, below replication threshold :{}", + tableNameWithType, nReplicasExternal, nReplicasIdealMax); } if (tableType == TableType.REALTIME && tableConfig != null) { @@ -346,13 +357,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.NUMBER_OF_REPLICAS); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE); - _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_SIZE); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_BYTE_SIZE); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE); + _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_DISABLED); _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_CONSUMPTION_PAUSED); @@ -371,6 +382,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS, Long.MIN_VALUE); _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS, Long.MIN_VALUE); _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE, Long.MIN_VALUE); + _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS, + Long.MIN_VALUE); _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, Long.MIN_VALUE); } diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java index 731f1f33d5..99991b3d4c 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java @@ -163,6 +163,8 @@ public class SegmentStatusCheckerTest { ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED), 5); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.NUMBER_OF_REPLICAS), 2); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), @@ -248,6 +250,8 @@ public class SegmentStatusCheckerTest { ControllerGauge.REPLICATION_FROM_CONFIG), 3); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.NUMBER_OF_REPLICAS), 3); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), @@ -346,6 +350,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.run(); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.NUMBER_OF_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), @@ -403,6 +409,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.run(); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.NUMBER_OF_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, @@ -446,6 +454,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.run(); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), Long.MIN_VALUE); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.NUMBER_OF_REPLICAS), Long.MIN_VALUE); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, @@ -538,6 +548,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.run(); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), ControllerGauge.NUMBER_OF_REPLICAS), 2); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(), @@ -593,6 +605,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.run(); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, + ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.NUMBER_OF_REPLICAS), 1); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, @@ -806,6 +820,8 @@ public class SegmentStatusCheckerTest { _segmentStatusChecker.start(); _segmentStatusChecker.run(); + Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, + ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE); Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org