This is an automated email from the ASF dual-hosted git repository. rmattingly pushed a commit to branch HBASE-29202-branch-2 in repository https://gitbox.apache.org/repos/asf/hbase.git
commit da4a53793d706ef652d5a31e9f6633b9c1a67f0b Author: Ray Mattingly <[email protected]> AuthorDate: Fri Mar 21 08:05:03 2025 -0400 HBASE-29202 Balancer conditionals make balancer actions more likely to be approved (#6821) Co-authored-by: Ray Mattingly <[email protected]> Signed-off-by: Nick Dimiduk <[email protected]> --- .../master/balancer/BalancerConditionals.java | 2 +- .../balancer/CandidateGeneratorTestUtil.java | 32 +++++--- ...ancingTableIsolationAndReplicaDistribution.java | 8 +- ....java => TestUnattainableBalancerCostGoal.java} | 86 +++++++++------------- 4 files changed, 61 insertions(+), 67 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java index 021a34bce6b..b82c68b37da 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java @@ -146,7 +146,7 @@ final class BalancerConditionals implements Configurable { // Reset cluster cluster.doAction(undoAction); - if (isViolatingPre && isViolatingPost) { + if (isViolatingPre == isViolatingPost) { return 0; } else if (!isViolatingPre && isViolatingPost) { return 1; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java index 03bfcce8e15..d2a9d17cdba 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java @@ -51,16 +51,22 @@ public final class CandidateGeneratorTestUtil { private CandidateGeneratorTestUtil() { } + enum ExhaustionType { + COST_GOAL_ACHIEVED, + NO_MORE_MOVES; + } + static void runBalancerToExhaustion(Configuration conf, Map<ServerName, List<RegionInfo>> serverToRegions, Set<Function<BalancerClusterState, Boolean>> expectations, float targetMaxBalancerCost) { - runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000); + runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000, + ExhaustionType.COST_GOAL_ACHIEVED); } static void runBalancerToExhaustion(Configuration conf, Map<ServerName, List<RegionInfo>> serverToRegions, Set<Function<BalancerClusterState, Boolean>> expectations, float targetMaxBalancerCost, - long maxRunningTime) { + long maxRunningTime, ExhaustionType exhaustionType) { // Do the full plan. We're testing with a lot of regions conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true); conf.setLong(MAX_RUNNING_TIME_KEY, maxRunningTime); @@ -76,7 +82,7 @@ public final class CandidateGeneratorTestUtil { boolean isBalanced = false; while (!isBalanced) { balancerRuns++; - if (balancerRuns > 1000) { + if (balancerRuns > 10) { throw new RuntimeException("Balancer failed to find balance & meet expectations"); } long start = System.currentTimeMillis(); @@ -111,16 +117,24 @@ public final class CandidateGeneratorTestUtil { } } if (isBalanced) { // Check if the balancer thinks we're done too - LOG.info("All balancer conditions passed. Checking if balancer thinks it's done."); - if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { - LOG.info("Balancer would still like to run"); - isBalanced = false; + if (exhaustionType == ExhaustionType.COST_GOAL_ACHIEVED) { + // If we expect to achieve the cost goal, then needsBalance should be false + if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { + LOG.info("Balancer cost goal is not achieved. needsBalance=true"); + isBalanced = false; + } } else { - LOG.info("Balancer is done"); + // If we anticipate running out of moves, then our last balance run should have produced + // nothing + if (regionPlans != null && !regionPlans.isEmpty()) { + LOG.info("Balancer is not out of moves. regionPlans.size()={}", regionPlans.size()); + isBalanced = false; + } } } } - LOG.info("Balancing took {}sec", Duration.ofMillis(balancingMillis).toMinutes()); + LOG.info("Balancer is done. Balancing took {}sec", + Duration.ofMillis(balancingMillis).toMinutes()); } /** diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java index 3a28ae801e4..bc31530f492 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java @@ -104,10 +104,10 @@ public class TestLargeClusterBalancingTableIsolationAndReplicaDistribution { conf.setBoolean(BalancerConditionals.ISOLATE_META_TABLE_KEY, true); conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); DistributeReplicasTestConditional.enableConditionalReplicaDistributionForTest(conf); - - runBalancerToExhaustion(conf, serverToRegions, ImmutableSet.of(this::isMetaTableIsolated, - this::isSystemTableIsolated, CandidateGeneratorTestUtil::areAllReplicasDistributed), 10.0f, - 60_000); + runBalancerToExhaustion(conf, serverToRegions, + ImmutableSet.of(this::isMetaTableIsolated, this::isSystemTableIsolated, + CandidateGeneratorTestUtil::areAllReplicasDistributed), + 10.0f, 60_000, CandidateGeneratorTestUtil.ExhaustionType.COST_GOAL_ACHIEVED); LOG.info("Meta table regions are successfully isolated, " + "and region replicas are appropriately distributed."); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java similarity index 53% copy from hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java copy to hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java index 3a28ae801e4..ffa2b4a7821 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java @@ -24,7 +24,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Random; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.ServerName; @@ -40,23 +40,27 @@ import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableSet; - -@Category({ MediumTests.class, MasterTests.class }) -public class TestLargeClusterBalancingTableIsolationAndReplicaDistribution { +/** + * If your minCostNeedsBalance is set too low, then the balancer should still eventually stop making + * moves as further cost improvements become impossible, and balancer plan calculation becomes + * wasteful. This test ensures that the balancer will not get stuck in a loop of continuously moving + * regions. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestUnattainableBalancerCostGoal { @ClassRule - public static final HBaseClassTestRule CLASS_RULE = HBaseClassTestRule - .forClass(TestLargeClusterBalancingTableIsolationAndReplicaDistribution.class); + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestUnattainableBalancerCostGoal.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestUnattainableBalancerCostGoal.class); - private static final Logger LOG = - LoggerFactory.getLogger(TestLargeClusterBalancingTableIsolationAndReplicaDistribution.class); private static final TableName SYSTEM_TABLE_NAME = TableName.valueOf("hbase:system"); - private static final TableName NON_ISOLATED_TABLE_NAME = TableName.valueOf("userTable"); + private static final TableName NON_SYSTEM_TABLE_NAME = TableName.valueOf("userTable"); - private static final int NUM_SERVERS = 500; - private static final int NUM_REGIONS = 2_500; - private static final int NUM_REPLICAS = 3; + private static final int NUM_SERVERS = 10; + private static final int NUM_REGIONS = 1000; + private static final float UNACHIEVABLE_COST_GOAL = 0.01f; private static final ServerName[] servers = new ServerName[NUM_SERVERS]; private static final Map<ServerName, List<RegionInfo>> serverToRegions = new HashMap<>(); @@ -66,62 +70,38 @@ public class TestLargeClusterBalancingTableIsolationAndReplicaDistribution { // Initialize servers for (int i = 0; i < NUM_SERVERS; i++) { servers[i] = ServerName.valueOf("server" + i, i, System.currentTimeMillis()); - serverToRegions.put(servers[i], new ArrayList<>()); } - // Create primary regions and their replicas + // Create regions + List<RegionInfo> allRegions = new ArrayList<>(); for (int i = 0; i < NUM_REGIONS; i++) { - TableName tableName; - if (i < 1) { - tableName = TableName.META_TABLE_NAME; - } else if (i < 10) { - tableName = SYSTEM_TABLE_NAME; - } else { - tableName = NON_ISOLATED_TABLE_NAME; - } - - // Define startKey and endKey for the region + TableName tableName = i < 3 ? SYSTEM_TABLE_NAME : NON_SYSTEM_TABLE_NAME; byte[] startKey = new byte[1]; startKey[0] = (byte) i; byte[] endKey = new byte[1]; endKey[0] = (byte) (i + 1); - Random random = new Random(); - // Create 3 replicas for each primary region - for (int replicaId = 0; replicaId < NUM_REPLICAS; replicaId++) { - RegionInfo regionInfo = RegionInfoBuilder.newBuilder(tableName).setStartKey(startKey) - .setEndKey(endKey).setReplicaId(replicaId).build(); - // Assign region to random server - int randomServer = random.nextInt(servers.length); - serverToRegions.get(servers[randomServer]).add(regionInfo); - } + RegionInfo regionInfo = + RegionInfoBuilder.newBuilder(tableName).setStartKey(startKey).setEndKey(endKey).build(); + allRegions.add(regionInfo); + } + + // Assign all regions to the first server + serverToRegions.put(servers[0], new ArrayList<>(allRegions)); + for (int i = 1; i < NUM_SERVERS; i++) { + serverToRegions.put(servers[i], new ArrayList<>()); } } @Test - public void testTableIsolationAndReplicaDistribution() { + public void testSystemTableIsolation() { Configuration conf = new Configuration(false); - conf.setBoolean(BalancerConditionals.ISOLATE_META_TABLE_KEY, true); conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); - DistributeReplicasTestConditional.enableConditionalReplicaDistributionForTest(conf); - - runBalancerToExhaustion(conf, serverToRegions, ImmutableSet.of(this::isMetaTableIsolated, - this::isSystemTableIsolated, CandidateGeneratorTestUtil::areAllReplicasDistributed), 10.0f, - 60_000); - LOG.info("Meta table regions are successfully isolated, " - + "and region replicas are appropriately distributed."); - } - - /** - * Validates whether all meta table regions are isolated. - */ - private boolean isMetaTableIsolated(BalancerClusterState cluster) { - return isTableIsolated(cluster, TableName.META_TABLE_NAME, "Meta"); + runBalancerToExhaustion(conf, serverToRegions, Set.of(this::isSystemTableIsolated), + UNACHIEVABLE_COST_GOAL, 10_000, CandidateGeneratorTestUtil.ExhaustionType.NO_MORE_MOVES); + LOG.info("Meta table regions are successfully isolated."); } - /** - * Validates whether all meta table regions are isolated. - */ private boolean isSystemTableIsolated(BalancerClusterState cluster) { return isTableIsolated(cluster, SYSTEM_TABLE_NAME, "System"); }
