ddanielr commented on code in PR #6049:
URL: https://github.com/apache/accumulo/pull/6049#discussion_r2873960999


##########
server/manager/src/main/java/org/apache/accumulo/manager/Manager.java:
##########
@@ -1192,15 +1219,35 @@ private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current,
               > MAX_BAD_STATUS_COUNT) {
             if (shutdownServerRateLimiter.tryAcquire()) {
               log.warn("attempting to stop {}", server);
-              try {
-                TServerConnection connection2 = 
tserverSet.getConnection(server);
-                if (connection2 != null) {
-                  connection2.halt(managerLock);
+              var gracefulHaltTimer = 
tserverHaltRpcAttempts.computeIfAbsent(server,
+                  s -> new GracefulHaltTimer(getConfiguration()));
+              if (gracefulHaltTimer.shouldForceHalt()) {
+                log.warn("tserver {} is not responding to halt requests, 
deleting zlock", server);
+                var zk = getContext().getZooReaderWriter();
+                var iid = getContext().getInstanceID();
+                String tserversPath = Constants.ZROOT + "/" + iid + 
Constants.ZTSERVERS;
+                try {
+                  ServiceLock.deleteLocks(zk, tserversPath, 
server.getHostAndPort()::equals,
+                      log::info, false);
+                  tserverHaltRpcAttempts.remove(server);
+                  badServers.remove(server);
+                } catch (KeeperException | InterruptedException e) {
+                  log.error("Failed to delete zlock for server {}", server, e);
+                }
+              } else {
+                try {
+                  TServerConnection connection2 = 
tserverSet.getConnection(server);
+                  if (connection2 != null) {
+                    connection2.halt(managerLock);
+                  }
+                } catch (TTransportException e1) {
+                  // ignore: it's probably down so log the exception at trace
+                  log.trace("error attempting to halt tablet server {}", 
server, e1);
+                } catch (Exception e2) {
+                  log.info("error talking to troublesome tablet server {}", 
server, e2);
+                } finally {
+                  gracefulHaltTimer.startTimer();

Review Comment:
   Addressed in commit 8dcc2ee.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to