This is an automated email from the ASF dual-hosted git repository.
stack pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.3 by this push:
new 7e7b730 HBASE-25973 Balancer should explain progress in a better way
in log -… (#3484)
7e7b730 is described below
commit 7e7b73024279676b42be059001b2115776e07576
Author: clarax <[email protected]>
AuthorDate: Fri Jul 16 15:23:09 2021 -0700
HBASE-25973 Balancer should explain progress in a better way in log -…
(#3484)
Signed-off-by: stack <[email protected]>
---
.../org/apache/hadoop/hbase/master/HMaster.java | 2 +
.../hbase/master/balancer/BaseLoadBalancer.java | 2 +
.../master/balancer/StochasticLoadBalancer.java | 115 ++++++++++++++-------
.../TestStochasticLoadBalancerBalanceCluster.java | 2 +-
4 files changed, 84 insertions(+), 37 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 85265ff..228e45c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1966,6 +1966,8 @@ public class HMaster extends HRegionServer implements
MasterServices {
}
}
}
+ LOG.info("Balancer is going into sleep until next period in {}ms",
getConfiguration()
+ .getInt(HConstants.HBASE_BALANCER_PERIOD,
HConstants.DEFAULT_HBASE_BALANCER_PERIOD));
return successRegionPlans;
}
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
index ee5e907..9a05a33 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
@@ -75,6 +75,8 @@ import org.slf4j.LoggerFactory;
*
*/
@InterfaceAudience.Private
[email protected](value="IS2_INCONSISTENT_SYNC",
+ justification="Complaint is about isByTable not being synchronized; we don't
modify often")
public abstract class BaseLoadBalancer implements LoadBalancer {
protected static final int MIN_SERVER_BALANCE = 2;
private volatile boolean stopped = false;
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
index a7ccc41..6b86374 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
@@ -145,7 +145,8 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
private List<CandidateGenerator> candidateGenerators;
private List<CostFunction> costFunctions; // FindBugs: Wants this protected;
IS2_INCONSISTENT_SYNC
-
+ // To save currently configed sum of multiplier. Defaulted at 1 for cases
that carry high cost
+ private float sumMultiplier = 1.0f;
// to save and report costs to JMX
private double curOverallCost = 0d;
private double[] tempFunctionCosts;
@@ -197,7 +198,6 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
}
regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
-
costFunctions = new ArrayList<>();
addCostFunction(new RegionCountSkewCostFunction(conf));
addCostFunction(new PrimaryRegionCountSkewCostFunction(conf));
@@ -311,46 +311,61 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
protected boolean needsBalance(TableName tableName, Cluster cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
- + " active regionserver(s)");
- }
+ LOG.info("Not running balancer because only " + cs.getNumServers() +
+ " active regionserver(s)");
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
return true;
}
if (idleRegionServerExist(cluster)){
+ LOG.info("Running balancer because cluster has idle server(s).");
return true;
}
+ sumMultiplier = 0.0f;
double total = 0.0;
- float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
- if (multiplier <= 0) {
- LOG.trace("{} not needed because multiplier is <= 0",
c.getClass().getSimpleName());
- continue;
- }
+ double cost = c.cost();
if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName());
continue;
}
+ total += cost * multiplier;
sumMultiplier += multiplier;
- total += c.cost() * multiplier;
+ }
+ if (sumMultiplier <= 0) {
+ LOG.error("At least one cost function needs a multiplier > 0. For
example, set "
+ + "hbase.master.balancer.stochastic.regionCountCost to a positive
value or default");
+ return false;
}
- boolean balanced = total <= 0 || sumMultiplier <= 0 ||
- (sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance);
- if (LOG.isDebugEnabled()) {
- LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to
need a balance is {}",
- balanced ? "Skipping load balancing because balanced" : "We need to
load balance",
- isByTable ? String.format("table (%s)", tableName) : "cluster",
- total, sumMultiplier, minCostNeedBalance);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Balance decision detailed function costs={}",
functionCost());
+ boolean balanced = (total / sumMultiplier < minCostNeedBalance);
+ if (balanced) {
+ String reason = "";
+ if (total <= 0) {
+ reason =
"(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total +
+ " <= 0";
+ } else if (sumMultiplier <= 0) {
+ reason = "sumMultiplier = " + sumMultiplier + " <= 0";
+ } else if ((total / sumMultiplier) < minCostNeedBalance) {
+ reason =
+
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier
= " + (
+ total / sumMultiplier) + " <= minCostNeedBalance(" +
minCostNeedBalance + ")";
}
+
+ LOG.info("{} - skipping load balancing because weighted average
imbalance={} <= " +
+ "threshold({}). If you want more aggressive balancing, either lower
" +
+ "hbase.master.balancer.stochastic.minCostNeedBalance from {} or
increase the relative" +
+ " multiplier(s) of the specific cost function(s). functionCost={}",
+ isByTable ? "Table specific (" + tableName + ")" : "Cluster wide",
total / sumMultiplier,
+ minCostNeedBalance, minCostNeedBalance, functionCost());
+ } else {
+ LOG.info("{} - Calculating plan. may take up to {}ms to complete.",
+ isByTable ? "Table specific ("+tableName+")" : "Cluster wide",
maxRunningTime);
}
return !balanced;
}
@@ -426,8 +441,9 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
maxSteps);
}
}
- LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost
+ ", functionCost="
- + functionCost() + " computedMaxSteps: " + computedMaxSteps);
+ LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average
imbalance={}, "
+ + "functionCost={} computedMaxSteps={}",
+ currentCost / sumMultiplier, functionCost(), computedMaxSteps);
// Perform a stochastic walk to see if we can get a good fit.
long step;
@@ -472,16 +488,16 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) {
plans = createRegionPlans(cluster);
- LOG.info("Finished computing new load balance plan. Computation took {}"
+
- " to try {} different iterations. Found a solution that moves " +
- "{} regions; Going from a computed cost of {}" +
- " to a new cost of {}", java.time.Duration.ofMillis(endTime -
startTime),
- step, plans.size(), initCost, currentCost);
+ LOG.info("Finished computing new moving plan. Computation took {} ms" +
+ " to try {} different iterations. Found a solution that moves " +
+ "{} regions; Going from a computed imbalance of {}" + " to a new
imbalance of {}. ",
+ endTime - startTime, step, plans.size(), initCost / sumMultiplier,
+ currentCost / sumMultiplier);
return plans;
}
- LOG.info("Could not find a better load balance plan. Tried {} different
configurations in " +
- "{}, and did not find anything with a computed cost less than {}", step,
- java.time.Duration.ofMillis(endTime - startTime), initCost);
+ LOG.info("Could not find a better moving plan. Tried {} different
configurations in " +
+ "{} ms, and did not find anything with an imbalance score less than
{}", step,
+ endTime - startTime, initCost / sumMultiplier);
return null;
}
@@ -513,7 +529,8 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
}
private void addCostFunction(CostFunction costFunction) {
- if (costFunction.getMultiplier() > 0) {
+ float multiplier = costFunction.getMultiplier();
+ if (multiplier > 0) {
costFunctions.add(costFunction);
}
}
@@ -524,9 +541,13 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
builder.append(c.getClass().getSimpleName());
builder.append(" : (");
if (c.isNeeded()) {
- builder.append(c.getMultiplier());
+ builder.append("multiplier=" + c.getMultiplier());
builder.append(", ");
- builder.append(c.cost());
+ double cost = c.cost();
+ builder.append("imbalance=" + cost);
+ if (cost < minCostNeedBalance) {
+ builder.append(", balanced");
+ }
} else {
builder.append("not needed");
}
@@ -535,6 +556,28 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
return builder.toString();
}
+
+ private String totalCostsPerFunc() {
+ StringBuilder builder = new StringBuilder();
+ for (CostFunction c : costFunctions) {
+ if (!c.isNeeded()) {
+ continue;
+ }
+ double cost = c.getMultiplier() * c.cost();
+ if (cost > 0.0) {
+ builder.append(" ");
+ builder.append(c.getClass().getSimpleName());
+ builder.append(" : ");
+ builder.append(cost);
+ builder.append(";");
+ }
+ }
+ if (builder.length() > 0) {
+ builder.deleteCharAt(builder.length() - 1);
+ }
+ return builder.toString();
+ }
+
/**
* Create all of the RegionPlan's needed to move from the initial cluster
state to the desired
* state.
@@ -601,7 +644,7 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java")
void updateCostsWithAction(Cluster cluster, Action action) {
for (CostFunction c : costFunctions) {
- if (c.getMultiplier() > 0 && c.isNeeded()) {
+ if (c.isNeeded()) {
c.postAction(action);
}
}
@@ -640,7 +683,7 @@ public class StochasticLoadBalancer extends
BaseLoadBalancer {
CostFunction c = costFunctions.get(i);
this.tempFunctionCosts[i] = 0.0;
- if (c.getMultiplier() <= 0 || !c.isNeeded()) {
+ if (!c.isNeeded()) {
continue;
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java
index be7eecc..c952149 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java
@@ -51,7 +51,7 @@ public class TestStochasticLoadBalancerBalanceCluster extends
BalancerTestBase {
*/
@Test
public void testBalanceCluster() throws Exception {
- conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 *
1000); // 800 sec
+ conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 *
1000); // 3 min
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f);
conf.setLong(StochasticLoadBalancer.MAX_STEPS_KEY, 20000000L);
loadBalancer.setConf(conf);