This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push:
new 82806f8e903 HBASE-28023 ITBLL's RollingBatchSuspendResumeRsAction
verify the success of the suspendRs action. (#6570)
82806f8e903 is described below
commit 82806f8e903d2af9638be5539cf99392bd822e41
Author: hiping-tech <[email protected]>
AuthorDate: Mon Jan 6 17:47:32 2025 +0800
HBASE-28023 ITBLL's RollingBatchSuspendResumeRsAction verify the success
of the suspendRs action. (#6570)
Co-authored-by: lvhaiping.lhp <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
(cherry picked from commit c9f4cc9ad47b0d3993a4bf5c43d16e9f3dbcd9f7)
---
.../org/apache/hadoop/hbase/ClusterManager.java | 12 +++++++
.../hadoop/hbase/DistributedHBaseCluster.java | 38 ++++++++++++++++++++++
.../apache/hadoop/hbase/HBaseClusterManager.java | 18 ++++++++++
.../apache/hadoop/hbase/RESTApiClusterManager.java | 10 ++++++
.../apache/hadoop/hbase/ZNodeClusterManager.java | 12 +++++++
.../apache/hadoop/hbase/chaos/actions/Action.java | 4 +--
.../actions/RollingBatchSuspendResumeRsAction.java | 2 +-
.../chaos/monkies/PolicyBasedChaosMonkey.java | 1 +
.../apache/hadoop/hbase/HBaseClusterInterface.java | 14 ++++++++
.../hadoop/hbase/SingleProcessHBaseCluster.java | 10 ++++++
10 files changed, 118 insertions(+), 3 deletions(-)
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
index 9fe59828062..2fc13592893 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
@@ -93,6 +93,18 @@ interface ClusterManager extends Configurable {
*/
boolean isRunning(ServiceType service, String hostname, int port) throws
IOException;
+ /**
+ * Returns whether the service is suspended on the remote host. This only checks whether the
+ * service status is suspended.
+ */
+ boolean isSuspended(ServiceType service, String hostname, int port) throws
IOException;
+
+ /**
+ * Returns whether the service is resumed on the remote host. This only checks whether the service
+ * status is resumed.
+ */
+ boolean isResumed(ServiceType service, String hostname, int port) throws
IOException;
+
/*
* TODO: further API ideas: //return services running on host: ServiceType[]
* getRunningServicesOnHost(String hostname); //return which services can be run on host (for
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index e88835a9da4..85313496d57 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -131,6 +131,16 @@ public class DistributedHBaseCluster extends HBaseClusterInterface {
waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
}
+ @Override
+ public void waitForRegionServerToSuspend(ServerName serverName, long
timeout) throws IOException {
+ waitForServiceToSuspend(ServiceType.HBASE_REGIONSERVER, serverName,
timeout);
+ }
+
+ @Override
+ public void waitForRegionServerToResume(ServerName serverName, long timeout)
throws IOException {
+ waitForServiceToResume(ServiceType.HBASE_REGIONSERVER, serverName,
timeout);
+ }
+
@Override
public void suspendRegionServer(ServerName serverName) throws IOException {
LOG.info("Suspend RS: {}", serverName.getServerName());
@@ -296,6 +306,34 @@ public class DistributedHBaseCluster extends HBaseClusterInterface {
throw new IOException("Timed-out waiting for service to start: " +
serverName);
}
+ private void waitForServiceToSuspend(ServiceType service, ServerName
serverName, long timeout)
+ throws IOException {
+ LOG.info("Waiting for service: {} to suspend: {}", service,
serverName.getServerName());
+ long start = System.currentTimeMillis();
+
+ while ((System.currentTimeMillis() - start) < timeout) {
+ if (clusterManager.isSuspended(service, serverName.getHostname(),
serverName.getPort())) {
+ return;
+ }
+ Threads.sleep(100);
+ }
+ throw new IOException("Timed-out waiting for service to suspend: " +
serverName);
+ }
+
+ private void waitForServiceToResume(ServiceType service, ServerName
serverName, long timeout)
+ throws IOException {
+ LOG.info("Waiting for service: {} to resume: {}", service,
serverName.getServerName());
+ long start = System.currentTimeMillis();
+
+ while ((System.currentTimeMillis() - start) < timeout) {
+ if (clusterManager.isResumed(service, serverName.getHostname(),
serverName.getPort())) {
+ return;
+ }
+ Threads.sleep(100);
+ }
+ throw new IOException("Timed-out waiting for service to resume: " +
serverName);
+ }
+
@Override
public void startMaster(String hostname, int port) throws IOException {
LOG.info("Starting Master on: {}:{}", hostname, port);
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index b16ac52b696..6ba06862729 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -216,6 +216,10 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
service);
}
+ protected String getStateCommand(ServiceType service) {
+ return String.format("%s | xargs ps -o state= -p ",
findPidCommand(service));
+ }
+
public String signalCommand(ServiceType service, String signal) {
return String.format("%s | xargs kill -s %s", findPidCommand(service),
signal);
}
@@ -465,4 +469,18 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
public void resume(ServiceType service, String hostname, int port) throws
IOException {
signal(service, Signal.SIGCONT, hostname);
}
+
+ public boolean isSuspended(ServiceType service, String hostname, int port)
throws IOException {
+ String ret =
+ execWithRetries(hostname, service,
getCommandProvider(service).getStateCommand(service))
+ .getSecond();
+ return ret != null && ret.trim().equals("T");
+ }
+
+ public boolean isResumed(ServiceType service, String hostname, int port)
throws IOException {
+ String ret =
+ execWithRetries(hostname, service,
getCommandProvider(service).getStateCommand(service))
+ .getSecond();
+ return ret != null && !ret.trim().equals("T");
+ }
}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
index 8c5339605e2..8b00d1372f4 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
@@ -264,6 +264,16 @@ public class RESTApiClusterManager extends Configured implements ClusterManager
hBaseClusterManager.resume(service, hostname, port);
}
+ @Override
+ public boolean isSuspended(ServiceType service, String hostname, int port)
throws IOException {
+ return hBaseClusterManager.isSuspended(service, hostname, port);
+ }
+
+ @Override
+ public boolean isResumed(ServiceType service, String hostname, int port)
throws IOException {
+ return hBaseClusterManager.isResumed(service, hostname, port);
+ }
+
// Convenience method to execute command against role on hostname. Only graceful commands are
// supported since cluster management APIs don't tend to let you SIGKILL things.
private void performClusterManagerCommand(ServiceType role, String hostname,
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
index 2a75f670f5b..fefa1248b73 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ZNodeClusterManager.java
@@ -111,6 +111,18 @@ public class ZNodeClusterManager extends Configured implements ClusterManager {
CmdType.bool.toString() +
getCommandProvider(service).isRunningCommand(service)));
}
+ @Override
+ public boolean isSuspended(ServiceType service, String hostname, int port)
throws IOException {
+ String ret = createZNode(hostname,
getCommandProvider(service).getStateCommand(service));
+ return ret != null && ret.trim().equals("T");
+ }
+
+ @Override
+ public boolean isResumed(ServiceType service, String hostname, int port)
throws IOException {
+ String ret = createZNode(hostname,
getCommandProvider(service).getStateCommand(service));
+ return ret != null && !ret.trim().equals("T");
+ }
+
enum CmdType {
exec,
bool
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 43bff05774e..65251328a82 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -197,7 +197,7 @@ public abstract class Action {
getLogger().info("Suspending regionserver {}", server);
cluster.suspendRegionServer(server);
if (!(cluster instanceof SingleProcessHBaseCluster)) {
- cluster.waitForRegionServerToStop(server, killRsTimeout);
+ cluster.waitForRegionServerToSuspend(server, killRsTimeout);
}
getLogger().info("Suspending regionserver {}. Reported num of rs:{}",
server,
cluster.getClusterMetrics().getLiveServerMetrics().size());
@@ -207,7 +207,7 @@ public abstract class Action {
getLogger().info("Resuming regionserver {}", server);
cluster.resumeRegionServer(server);
if (!(cluster instanceof SingleProcessHBaseCluster)) {
- cluster.waitForRegionServerToStart(server.getHostname(),
server.getPort(), startRsTimeout);
+ cluster.waitForRegionServerToResume(server, startRsTimeout);
}
getLogger().info("Resuming regionserver {}. Reported num of rs:{}", server,
cluster.getClusterMetrics().getLiveServerMetrics().size());
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
index 07261bb58e0..559dec829ee 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
@@ -65,7 +65,7 @@ public class RollingBatchSuspendResumeRsAction extends Action {
@Override
public void perform() throws Exception {
- getLogger().info("Performing action: Rolling batch restarting {}% of
region servers",
+ getLogger().info("Performing action: Rolling batch suspending {}% of
region servers",
(int) (ratio * 100));
List<ServerName> selectedServers = selectServers();
Queue<ServerName> serversToBeSuspended = new ArrayDeque<>(selectedServers);
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
index f5af796b575..fb8ab209c3a 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
@@ -85,6 +85,7 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey {
private static ExecutorService buildMonkeyThreadPool(final int size) {
return Executors.newFixedThreadPool(size, new
ThreadFactoryBuilder().setDaemon(false)
.setNameFormat("ChaosMonkey-%d").setUncaughtExceptionHandler((t, e) -> {
+ LOG.error("Uncaught exception in thread {}", t.getName(), e);
throw new RuntimeException(e);
}).build());
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
index 3602997e398..f56fc57dd2d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseClusterInterface.java
@@ -160,6 +160,20 @@ public abstract class HBaseClusterInterface implements Closeable, Configurable {
*/
public abstract void resumeRegionServer(ServerName serverName) throws
IOException;
+ /**
+ * Wait for the specified region server to suspend the thread / process.
+ * @throws IOException if something goes wrong or timeout occurs
+ */
+ public abstract void waitForRegionServerToSuspend(ServerName serverName,
long timeout)
+ throws IOException;
+
+ /**
+ * Wait for the specified region server to resume the thread / process.
+ * @throws IOException if something goes wrong or timeout occurs
+ */
+ public abstract void waitForRegionServerToResume(ServerName serverName, long
timeout)
+ throws IOException;
+
/**
* Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
* logs warning message.
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
index 4ea4d73dd98..df4f30befa3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/SingleProcessHBaseCluster.java
@@ -297,6 +297,16 @@ public class SingleProcessHBaseCluster extends HBaseClusterInterface {
waitOnRegionServer(getRegionServerIndex(serverName));
}
+ @Override
+ public void waitForRegionServerToSuspend(ServerName serverName, long
timeout) throws IOException {
+ LOG.warn("Waiting for regionserver to suspend on mini cluster is not
supported");
+ }
+
+ @Override
+ public void waitForRegionServerToResume(ServerName serverName, long timeout)
throws IOException {
+ LOG.warn("Waiting for regionserver to resume on mini cluster is not
supported");
+ }
+
@Override
public void startZkNode(String hostname, int port) throws IOException {
LOG.warn("Starting zookeeper nodes on mini cluster is not supported");