This is an automated email from the ASF dual-hosted git repository. jxue pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push: new 6609c3773 [apache/helix] -- Added detail in the Exception message for WAGED rebalance (hard constraint) failures. (#2829) 6609c3773 is described below commit 6609c377322bd7f5fceb035fc299001a8583eaea Author: Himanshu Kandwal <himanshuk...@gmail.com> AuthorDate: Wed Jul 17 10:53:15 2024 -0700 [apache/helix] -- Added detail in the Exception message for WAGED rebalance (hard constraint) failures. (#2829) We are enabling constraint level logging in the case when WAGED algorithm is not able to find the placement for a replica. The logging will be controlled via a flag which is enabled when the failure criteria is met, else is disabled. --- .../constraints/ConstraintBasedAlgorithm.java | 25 +++++++++++ .../constraints/FaultZoneAwareConstraint.java | 6 ++- .../waged/constraints/HardConstraint.java | 10 +++++ .../waged/constraints/NodeCapacityConstraint.java | 6 ++- .../NodeMaxPartitionLimitConstraint.java | 6 ++- .../constraints/ReplicaActivateConstraint.java | 4 +- .../SamePartitionOnInstanceConstraint.java | 4 +- .../waged/constraints/ValidGroupTagConstraint.java | 4 +- .../rebalancer/waged/model/ClusterContext.java | 4 ++ .../constraints/TestConstraintBasedAlgorithm.java | 48 ++++++++++++++++++++-- 10 files changed, 104 insertions(+), 13 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ConstraintBasedAlgorithm.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ConstraintBasedAlgorithm.java index 60d43764a..e7510987a 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ConstraintBasedAlgorithm.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ConstraintBasedAlgorithm.java @@ -22,6 +22,7 @@ package org.apache.helix.controller.rebalancer.waged.constraints; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -30,6 +31,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Maps; import org.apache.helix.HelixRebalanceException; import org.apache.helix.controller.rebalancer.waged.RebalanceAlgorithm; @@ -136,11 +138,16 @@ class ConstraintBasedAlgorithm implements RebalanceAlgorithm { }).collect(Collectors.toList()); if (candidateNodes.isEmpty()) { + LOG.info("Found no eligible candidate nodes. Enabling hard constraint level logging for cluster: {}", clusterContext.getClusterName()); + enableFullLoggingForCluster(); optimalAssignment.recordAssignmentFailure(replica, Maps.transformValues(hardConstraintFailures, this::convertFailureReasons)); return Optional.empty(); } + LOG.info("Disabling hard constraint level logging for cluster: {}", clusterContext.getClusterName()); + removeFullLoggingForCluster(); + return candidateNodes.parallelStream().map(node -> new HashMap.SimpleEntry<>(node, getAssignmentNormalizedScore(node, replica, clusterContext))) .max((nodeEntry1, nodeEntry2) -> { @@ -179,6 +186,24 @@ class ConstraintBasedAlgorithm implements RebalanceAlgorithm { .collect(Collectors.toList()); } + /** + * Enables logging of failures in all hard constraints + */ + private void enableFullLoggingForCluster() { + for (HardConstraint hardConstraint : _hardConstraints) { + hardConstraint.setEnableLogging(true); + } + } + + /** + * Removes the cluster from full logging in all hard constraints (if added previously) + */ + private void removeFullLoggingForCluster() { + for (HardConstraint hardConstraint : _hardConstraints) { + hardConstraint.setEnableLogging(false); + } + } + private static class AssignableReplicaWithScore implements Comparable<AssignableReplicaWithScore> { private final AssignableReplica _replica; private float _score = 0; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java index e0992b5d4..459e35853 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java @@ -42,8 +42,10 @@ class FaultZoneAwareConstraint extends HardConstraint { clusterContext.getPartitionsForResourceAndFaultZone(replica.getResourceName(), node.getFaultZone()); if (partitionsForResourceAndFaultZone.contains(replica.getPartitionName())) { - LOG.debug("A fault zone cannot contain more than 1 replica of same partition. Found replica for partition: {}", - replica.getPartitionName()); + if (enableLogging) { + LOG.info("A fault zone cannot contain more than 1 replica of same partition. Found replica for partition: {}", + replica.getPartitionName()); + } return false; } return true; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/HardConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/HardConstraint.java index a0e2ff8c4..281adae34 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/HardConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/HardConstraint.java @@ -29,6 +29,8 @@ import org.apache.helix.controller.rebalancer.waged.model.ClusterContext; */ abstract class HardConstraint { + protected boolean enableLogging = false; + /** * Check if the replica could be assigned to the node * @return True if the proposed assignment is valid; False otherwise @@ -44,4 +46,12 @@ abstract class HardConstraint { String getDescription() { return getClass().getName(); } + + /** + * Sets the flag to enable constraint level logging + */ + public void setEnableLogging(boolean enableLogging) { + this.enableLogging = enableLogging; + } + } diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java index 056d7f842..a686037de 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java @@ -40,8 +40,10 @@ class NodeCapacityConstraint extends HardConstraint { for (String key : replicaCapacity.keySet()) { if (nodeCapacity.containsKey(key)) { if (nodeCapacity.get(key) < replicaCapacity.get(key)) { - LOG.debug("Node has insufficient capacity for: {}. Left available: {}, Required: {}", - key, nodeCapacity.get(key), replicaCapacity.get(key)); + if (enableLogging) { + LOG.info("Node has insufficient capacity for: {}. Left available: {}, Required: {}", + key, nodeCapacity.get(key), replicaCapacity.get(key)); + } return false; } } diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java index deedf4aa5..08b2ee19e 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java @@ -48,8 +48,10 @@ class NodeMaxPartitionLimitConstraint extends HardConstraint { || assignedPartitionsByResourceSize < resourceMaxPartitionsPerInstance; if (!exceedResourceMaxPartitionLimit) { - LOG.debug("Cannot exceed the max number of partitions per resource ({}) limitation on node. Assigned replica count: {}", - resourceMaxPartitionsPerInstance, assignedPartitionsByResourceSize); + if (enableLogging) { + LOG.info("Cannot exceed the max number of partitions per resource ({}) limitation on node. Assigned replica count: {}", + resourceMaxPartitionsPerInstance, assignedPartitionsByResourceSize); + } return false; } return true; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java index 13e380a0a..efee2a219 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java @@ -38,7 +38,9 @@ class ReplicaActivateConstraint extends HardConstraint { List<String> disabledPartitions = node.getDisabledPartitionsMap().get(replica.getResourceName()); if (disabledPartitions != null && disabledPartitions.contains(replica.getPartitionName())) { - LOG.debug("Cannot assign the inactive replica: {}", replica.getPartitionName()); + if (enableLogging) { + LOG.info("Cannot assign the inactive replica: {}", replica.getPartitionName()); + } return false; } return true; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java index e6de9989c..3dd7c65bb 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java @@ -37,7 +37,9 @@ class SamePartitionOnInstanceConstraint extends HardConstraint { Set<String> assignedPartitionsByResource = node.getAssignedPartitionsByResource(replica.getResourceName()); if (assignedPartitionsByResource.contains(replica.getPartitionName())) { - LOG.debug("Same partition ({}) of different states cannot co-exist in one instance", replica.getPartitionName()); + if (enableLogging) { + LOG.info("Same partition ({}) of different states cannot co-exist in one instance", replica.getPartitionName()); + } return false; } return true; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java index b1e257c87..d0609e005 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java @@ -38,7 +38,9 @@ class ValidGroupTagConstraint extends HardConstraint { } if (!node.getInstanceTags().contains(replica.getResourceInstanceGroupTag())) { - LOG.debug("Instance doesn't have the tag of the replica ({})", replica.getResourceInstanceGroupTag()); + if (enableLogging) { + LOG.info("Instance doesn't have the tag of the replica ({})", replica.getResourceInstanceGroupTag()); + } return false; } return true; diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterContext.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterContext.java index 1b3ad2a87..32debca65 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterContext.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterContext.java @@ -183,6 +183,10 @@ public class ClusterContext { return _clusterCapacityMap; } + public String getClusterName() { + return _clusterName; + } + public Set<String> getPartitionsForResourceAndFaultZone(String resourceName, String faultZoneId) { return _assignmentForFaultZoneMap.getOrDefault(faultZoneId, Collections.emptyMap()) .getOrDefault(resourceName, Collections.emptySet()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java index 0a75002f1..1b7b1b944 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java @@ -19,12 +19,13 @@ package org.apache.helix.controller.rebalancer.waged.constraints; * under the License. */ -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Set; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import org.apache.helix.HelixRebalanceException; import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica; import org.apache.helix.controller.rebalancer.waged.model.ClusterModel; @@ -33,14 +34,20 @@ import org.apache.helix.controller.rebalancer.waged.model.OptimalAssignment; import org.testng.Assert; import org.testng.annotations.Test; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Matchers.any; +import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; public class TestConstraintBasedAlgorithm { - @Test(expectedExceptions = HelixRebalanceException.class) - public void testCalculateNoValidAssignment() throws IOException, HelixRebalanceException { + + @Test + public void testCalculateNoValidAssignment() throws IOException { HardConstraint mockHardConstraint = mock(HardConstraint.class); SoftConstraint mockSoftConstraint = mock(SoftConstraint.class); when(mockHardConstraint.isAssignmentValid(any(), any(), any())).thenReturn(false); @@ -49,7 +56,40 @@ public class TestConstraintBasedAlgorithm { new ConstraintBasedAlgorithm(ImmutableList.of(mockHardConstraint), ImmutableMap.of(mockSoftConstraint, 1f)); ClusterModel clusterModel = new ClusterModelTestHelper().getDefaultClusterModel(); + try { + algorithm.calculate(clusterModel); + } catch (HelixRebalanceException ex) { + Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); + } + + verify(mockHardConstraint, times(1)).setEnableLogging(eq(true)); + verify(mockHardConstraint, times(1)).isAssignmentValid(any(), any(), any()); + } + + @Test + public void testCalculateNoValidAssignmentFirstAndThenRecovery() throws IOException, HelixRebalanceException { + HardConstraint mockHardConstraint = mock(HardConstraint.class); + SoftConstraint mockSoftConstraint = mock(SoftConstraint.class); + when(mockHardConstraint.isAssignmentValid(any(), any(), any())) + .thenReturn(false) // hard constraint fails + .thenReturn(true); // hard constraint recovers + when(mockSoftConstraint.getAssignmentNormalizedScore(any(), any(), any())).thenReturn(1.0); + ConstraintBasedAlgorithm algorithm = + new ConstraintBasedAlgorithm(ImmutableList.of(mockHardConstraint), + ImmutableMap.of(mockSoftConstraint, 1f)); + ClusterModel clusterModel = new ClusterModelTestHelper().getDefaultClusterModel(); + try { + algorithm.calculate(clusterModel); + } catch (HelixRebalanceException ex) { + Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); + } + + verify(mockHardConstraint, times(1)).setEnableLogging(eq(true)); + verify(mockHardConstraint, times(1)).isAssignmentValid(any(), any(), any()); + + // calling again for recovery (no exception) algorithm.calculate(clusterModel); + verify(mockHardConstraint, atLeastOnce()).setEnableLogging(eq(false)); } @Test