[GitHub] [hbase] bharathv commented on a change in pull request #1755: HBASE-24069 Provide an ExponentialBackOffPolicy sleep between failed …

GitBox Fri, 29 May 2020 16:23:49 -0700


bharathv commented on a change in pull request #1755:
URL: https://github.com/apache/hbase/pull/1755#discussion_r432774809




##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
##########
@@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region,
         }
         return;
       }
+      long sleepTime = 0;
       try {
         // Send CLOSE RPC
-        if (serverManager.sendRegionClose(server, region,
-          versionOfClosingNode, dest, transitionInZK)) {
-          LOG.debug("Sent CLOSE to " + server + " for region " +
-            region.getRegionNameAsString());
+        if (serverManager.sendRegionClose(server, region, 
versionOfClosingNode, dest,
+          transitionInZK)) {
+          LOG.debug("Sent CLOSE to " + server + " for region " + 
region.getRegionNameAsString());
           if (useZKForAssignment && !transitionInZK && state != null) {
             // Retry to make sure the region is
             // closed so as to avoid double assignment.
-            unassign(region, state, versionOfClosingNode,
-              dest, transitionInZK, src);
+            unassign(region, state, versionOfClosingNode, dest, 
transitionInZK, src);
           }
           return;
         }
         // This never happens. Currently regionserver close always return true.
         // Todo; this can now happen (0.96) if there is an exception in a 
coprocessor
-        LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
-          region.getRegionNameAsString());
-      } catch (Throwable t) {
-        long sleepTime = 0;
+        LOG.warn("Server " + server + " region CLOSE RPC returned false for "
+            + region.getRegionNameAsString());
+      } catch (Throwable t) {       
         Configuration conf = this.server.getConfiguration();
         if (t instanceof RemoteException) {
-          t = ((RemoteException)t).unwrapRemoteException();
+          t = ((RemoteException) t).unwrapRemoteException();
         }
         boolean logRetries = true;
-        if (t instanceof RegionServerAbortedException
-            || t instanceof RegionServerStoppedException
+        if (t instanceof RegionServerAbortedException || t instanceof 
RegionServerStoppedException

Review comment:
       nit: (t instanceof RegionServerAbortedException) is redundant (per 
static analysis check in the IDE)...

##########
File path: 
hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
##########
@@ -598,6 +601,68 @@ public void testCloseFailed() throws Exception {
     }
   }
 
+  /**
+   * This tests region close with exponential backoff
+   */
+  @Test(timeout = 60000)
+  public void testCloseRegionWithExponentialBackOff() throws Exception {
+    String table = "testCloseRegionWithExponentialBackOff";

Review comment:
       nit: We use TestTableName.getTableName() that automatically gets the 
TableName with the running test name.

##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
##########
@@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region,
         }
         return;
       }
+      long sleepTime = 0;
       try {
         // Send CLOSE RPC
-        if (serverManager.sendRegionClose(server, region,
-          versionOfClosingNode, dest, transitionInZK)) {
-          LOG.debug("Sent CLOSE to " + server + " for region " +
-            region.getRegionNameAsString());
+        if (serverManager.sendRegionClose(server, region, 
versionOfClosingNode, dest,
+          transitionInZK)) {
+          LOG.debug("Sent CLOSE to " + server + " for region " + 
region.getRegionNameAsString());
           if (useZKForAssignment && !transitionInZK && state != null) {
             // Retry to make sure the region is
             // closed so as to avoid double assignment.
-            unassign(region, state, versionOfClosingNode,
-              dest, transitionInZK, src);
+            unassign(region, state, versionOfClosingNode, dest, 
transitionInZK, src);
           }
           return;
         }
         // This never happens. Currently regionserver close always return true.
         // Todo; this can now happen (0.96) if there is an exception in a 
coprocessor
-        LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
-          region.getRegionNameAsString());
-      } catch (Throwable t) {
-        long sleepTime = 0;
+        LOG.warn("Server " + server + " region CLOSE RPC returned false for "
+            + region.getRegionNameAsString());
+      } catch (Throwable t) {       
         Configuration conf = this.server.getConfiguration();
         if (t instanceof RemoteException) {
-          t = ((RemoteException)t).unwrapRemoteException();
+          t = ((RemoteException) t).unwrapRemoteException();
         }
         boolean logRetries = true;
-        if (t instanceof RegionServerAbortedException
-            || t instanceof RegionServerStoppedException
+        if (t instanceof RegionServerAbortedException || t instanceof 
RegionServerStoppedException
             || t instanceof ServerNotRunningYetException) {
           // RS is aborting or stopping, we cannot offline the region since 
the region may need
-          // to do WAL recovery. Until we see  the RS expiration, we should 
retry.
+          // to do WAL recovery. Until we see the RS expiration, we should 
retry.
           sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
             RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
 
         } else if (t instanceof NotServingRegionException) {
-          LOG.debug("Offline " + region.getRegionNameAsString()
-            + ", it's not any more on " + server, t);
+          LOG.debug(
+            "Offline " + region.getRegionNameAsString() + ", it's not any more 
on " + server, t);
           if (transitionInZK) {
             deleteClosingOrClosedNode(region, server);
           }
           if (state != null) {
             regionOffline(region);
           }
           return;
-        } else if ((t instanceof FailedServerException) || (state != null &&
-            t instanceof RegionAlreadyInTransitionException)) {
-          if (t instanceof FailedServerException) {
-            sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
+        } else if ((t instanceof FailedServerException)
+            || (state != null && t instanceof 
RegionAlreadyInTransitionException)) {
+              if (t instanceof FailedServerException) {
+                sleepTime = 1L + 
conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
-          } else {
-            // RS is already processing this region, only need to update the 
timestamp
-            LOG.debug("update " + state + " the timestamp.");
-            state.updateTimestampToNow();
-            if (maxWaitTime < 0) {
-              maxWaitTime =
-                  EnvironmentEdgeManager.currentTime()
-                      + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
-                        DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
-            }
-            long now = EnvironmentEdgeManager.currentTime();
-            if (now < maxWaitTime) {
-              LOG.debug("Region is already in transition; "
-                + "waiting up to " + (maxWaitTime - now) + "ms", t);
-              sleepTime = 100;
-              i--; // reset the try count
-              logRetries = false;
+              } else {
+                // RS is already processing this region, only need to update 
the timestamp
+                LOG.debug("update " + state + " the timestamp.");
+                state.updateTimestampToNow();
+                if (maxWaitTime < 0) {
+                  maxWaitTime = EnvironmentEdgeManager.currentTime() + 
conf.getLong(
+                    ALREADY_IN_TRANSITION_WAITTIME, 
DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+                }
+                long now = EnvironmentEdgeManager.currentTime();
+                if (now < maxWaitTime) {
+                  LOG.debug("Region is already in transition; " + "waiting up 
to "
+                      + (maxWaitTime - now) + "ms",
+                    t);
+                  sleepTime = 100;
+                  i--; // reset the try count
+                  logRetries = false;
+                }
+              }
             }
-          }
-        }
-
-        try {
-          if (sleepTime > 0) {
-            Thread.sleep(sleepTime);
-          }
-        } catch (InterruptedException ie) {
-          LOG.warn("Failed to unassign "
-            + region.getRegionNameAsString() + " since interrupted", ie);
-          Thread.currentThread().interrupt();
-          if (state != null) {
-            regionStates.updateRegionState(region, State.FAILED_CLOSE);
-          }
-          return;
-        }
 
         if (logRetries) {
-          LOG.info("Server " + server + " returned " + t + " for "
-            + region.getRegionNameAsString() + ", try=" + i
-            + " of " + this.maximumAttempts, t);
+          LOG.info("Server " + server + " returned " + t + " for " + 
region.getRegionNameAsString()
+              + ", try=" + i + " of " + this.maximumAttempts,
+            t);
           // Presume retry or server will expire.
         }
       }
+      // If sleepTime is not set by any of the cases, set it to sleep for

Review comment:
       nit: Can you please add a few comments on the code flow so that its 
easier for other to follow?

##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
##########
@@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region,
         }
         return;
       }
+      long sleepTime = 0;
       try {
         // Send CLOSE RPC
-        if (serverManager.sendRegionClose(server, region,
-          versionOfClosingNode, dest, transitionInZK)) {
-          LOG.debug("Sent CLOSE to " + server + " for region " +
-            region.getRegionNameAsString());
+        if (serverManager.sendRegionClose(server, region, 
versionOfClosingNode, dest,
+          transitionInZK)) {
+          LOG.debug("Sent CLOSE to " + server + " for region " + 
region.getRegionNameAsString());
           if (useZKForAssignment && !transitionInZK && state != null) {
             // Retry to make sure the region is
             // closed so as to avoid double assignment.
-            unassign(region, state, versionOfClosingNode,
-              dest, transitionInZK, src);
+            unassign(region, state, versionOfClosingNode, dest, 
transitionInZK, src);
           }
           return;
         }
         // This never happens. Currently regionserver close always return true.
         // Todo; this can now happen (0.96) if there is an exception in a 
coprocessor
-        LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
-          region.getRegionNameAsString());
-      } catch (Throwable t) {
-        long sleepTime = 0;
+        LOG.warn("Server " + server + " region CLOSE RPC returned false for "
+            + region.getRegionNameAsString());
+      } catch (Throwable t) {       
         Configuration conf = this.server.getConfiguration();
         if (t instanceof RemoteException) {
-          t = ((RemoteException)t).unwrapRemoteException();
+          t = ((RemoteException) t).unwrapRemoteException();
         }
         boolean logRetries = true;
-        if (t instanceof RegionServerAbortedException
-            || t instanceof RegionServerStoppedException
+        if (t instanceof RegionServerAbortedException || t instanceof 
RegionServerStoppedException
             || t instanceof ServerNotRunningYetException) {
           // RS is aborting or stopping, we cannot offline the region since 
the region may need
-          // to do WAL recovery. Until we see  the RS expiration, we should 
retry.
+          // to do WAL recovery. Until we see the RS expiration, we should 
retry.
           sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
             RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
 
         } else if (t instanceof NotServingRegionException) {
-          LOG.debug("Offline " + region.getRegionNameAsString()
-            + ", it's not any more on " + server, t);
+          LOG.debug(
+            "Offline " + region.getRegionNameAsString() + ", it's not any more 
on " + server, t);
           if (transitionInZK) {
             deleteClosingOrClosedNode(region, server);
           }
           if (state != null) {
             regionOffline(region);
           }
           return;
-        } else if ((t instanceof FailedServerException) || (state != null &&
-            t instanceof RegionAlreadyInTransitionException)) {
-          if (t instanceof FailedServerException) {
-            sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
+        } else if ((t instanceof FailedServerException)
+            || (state != null && t instanceof 
RegionAlreadyInTransitionException)) {
+              if (t instanceof FailedServerException) {
+                sleepTime = 1L + 
conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
-          } else {
-            // RS is already processing this region, only need to update the 
timestamp
-            LOG.debug("update " + state + " the timestamp.");
-            state.updateTimestampToNow();
-            if (maxWaitTime < 0) {
-              maxWaitTime =
-                  EnvironmentEdgeManager.currentTime()
-                      + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
-                        DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
-            }
-            long now = EnvironmentEdgeManager.currentTime();
-            if (now < maxWaitTime) {
-              LOG.debug("Region is already in transition; "
-                + "waiting up to " + (maxWaitTime - now) + "ms", t);
-              sleepTime = 100;
-              i--; // reset the try count
-              logRetries = false;
+              } else {
+                // RS is already processing this region, only need to update 
the timestamp
+                LOG.debug("update " + state + " the timestamp.");
+                state.updateTimestampToNow();
+                if (maxWaitTime < 0) {
+                  maxWaitTime = EnvironmentEdgeManager.currentTime() + 
conf.getLong(
+                    ALREADY_IN_TRANSITION_WAITTIME, 
DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+                }
+                long now = EnvironmentEdgeManager.currentTime();
+                if (now < maxWaitTime) {
+                  LOG.debug("Region is already in transition; " + "waiting up 
to "
+                      + (maxWaitTime - now) + "ms",
+                    t);
+                  sleepTime = 100;
+                  i--; // reset the try count
+                  logRetries = false;
+                }
+              }
             }
-          }
-        }
-
-        try {
-          if (sleepTime > 0) {
-            Thread.sleep(sleepTime);
-          }
-        } catch (InterruptedException ie) {
-          LOG.warn("Failed to unassign "
-            + region.getRegionNameAsString() + " since interrupted", ie);
-          Thread.currentThread().interrupt();
-          if (state != null) {
-            regionStates.updateRegionState(region, State.FAILED_CLOSE);
-          }
-          return;
-        }
 
         if (logRetries) {
-          LOG.info("Server " + server + " returned " + t + " for "
-            + region.getRegionNameAsString() + ", try=" + i
-            + " of " + this.maximumAttempts, t);
+          LOG.info("Server " + server + " returned " + t + " for " + 
region.getRegionNameAsString()

Review comment:
       merge this logging and the logging in L2074 and log towards the end?

##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
##########
@@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region,
         }
         return;
       }
+      long sleepTime = 0;
       try {
         // Send CLOSE RPC
-        if (serverManager.sendRegionClose(server, region,
-          versionOfClosingNode, dest, transitionInZK)) {
-          LOG.debug("Sent CLOSE to " + server + " for region " +
-            region.getRegionNameAsString());
+        if (serverManager.sendRegionClose(server, region, 
versionOfClosingNode, dest,
+          transitionInZK)) {
+          LOG.debug("Sent CLOSE to " + server + " for region " + 
region.getRegionNameAsString());
           if (useZKForAssignment && !transitionInZK && state != null) {
             // Retry to make sure the region is
             // closed so as to avoid double assignment.
-            unassign(region, state, versionOfClosingNode,
-              dest, transitionInZK, src);
+            unassign(region, state, versionOfClosingNode, dest, 
transitionInZK, src);
           }
           return;
         }
         // This never happens. Currently regionserver close always return true.
         // Todo; this can now happen (0.96) if there is an exception in a 
coprocessor
-        LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
-          region.getRegionNameAsString());
-      } catch (Throwable t) {
-        long sleepTime = 0;
+        LOG.warn("Server " + server + " region CLOSE RPC returned false for "
+            + region.getRegionNameAsString());
+      } catch (Throwable t) {       
         Configuration conf = this.server.getConfiguration();
         if (t instanceof RemoteException) {
-          t = ((RemoteException)t).unwrapRemoteException();
+          t = ((RemoteException) t).unwrapRemoteException();
         }
         boolean logRetries = true;
-        if (t instanceof RegionServerAbortedException
-            || t instanceof RegionServerStoppedException
+        if (t instanceof RegionServerAbortedException || t instanceof 
RegionServerStoppedException
             || t instanceof ServerNotRunningYetException) {
           // RS is aborting or stopping, we cannot offline the region since 
the region may need
-          // to do WAL recovery. Until we see  the RS expiration, we should 
retry.
+          // to do WAL recovery. Until we see the RS expiration, we should 
retry.
           sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
             RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
 
         } else if (t instanceof NotServingRegionException) {
-          LOG.debug("Offline " + region.getRegionNameAsString()
-            + ", it's not any more on " + server, t);
+          LOG.debug(
+            "Offline " + region.getRegionNameAsString() + ", it's not any more 
on " + server, t);
           if (transitionInZK) {
             deleteClosingOrClosedNode(region, server);
           }
           if (state != null) {
             regionOffline(region);
           }
           return;
-        } else if ((t instanceof FailedServerException) || (state != null &&
-            t instanceof RegionAlreadyInTransitionException)) {
-          if (t instanceof FailedServerException) {
-            sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
+        } else if ((t instanceof FailedServerException)
+            || (state != null && t instanceof 
RegionAlreadyInTransitionException)) {
+              if (t instanceof FailedServerException) {
+                sleepTime = 1L + 
conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
-          } else {
-            // RS is already processing this region, only need to update the 
timestamp
-            LOG.debug("update " + state + " the timestamp.");
-            state.updateTimestampToNow();
-            if (maxWaitTime < 0) {
-              maxWaitTime =
-                  EnvironmentEdgeManager.currentTime()
-                      + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
-                        DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
-            }
-            long now = EnvironmentEdgeManager.currentTime();
-            if (now < maxWaitTime) {
-              LOG.debug("Region is already in transition; "
-                + "waiting up to " + (maxWaitTime - now) + "ms", t);
-              sleepTime = 100;
-              i--; // reset the try count
-              logRetries = false;
+              } else {
+                // RS is already processing this region, only need to update 
the timestamp
+                LOG.debug("update " + state + " the timestamp.");
+                state.updateTimestampToNow();
+                if (maxWaitTime < 0) {
+                  maxWaitTime = EnvironmentEdgeManager.currentTime() + 
conf.getLong(
+                    ALREADY_IN_TRANSITION_WAITTIME, 
DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+                }
+                long now = EnvironmentEdgeManager.currentTime();
+                if (now < maxWaitTime) {
+                  LOG.debug("Region is already in transition; " + "waiting up 
to "
+                      + (maxWaitTime - now) + "ms",
+                    t);
+                  sleepTime = 100;
+                  i--; // reset the try count
+                  logRetries = false;
+                }
+              }
             }
-          }
-        }
-
-        try {
-          if (sleepTime > 0) {
-            Thread.sleep(sleepTime);
-          }
-        } catch (InterruptedException ie) {
-          LOG.warn("Failed to unassign "
-            + region.getRegionNameAsString() + " since interrupted", ie);
-          Thread.currentThread().interrupt();
-          if (state != null) {
-            regionStates.updateRegionState(region, State.FAILED_CLOSE);
-          }
-          return;
-        }
 
         if (logRetries) {
-          LOG.info("Server " + server + " returned " + t + " for "
-            + region.getRegionNameAsString() + ", try=" + i
-            + " of " + this.maximumAttempts, t);
+          LOG.info("Server " + server + " returned " + t + " for " + 
region.getRegionNameAsString()
+              + ", try=" + i + " of " + this.maximumAttempts,
+            t);
           // Presume retry or server will expire.
         }
       }
+      // If sleepTime is not set by any of the cases, set it to sleep for
+      // configured exponential backoff time
+      if (sleepTime == 0 && i != maximumAttempts) {

Review comment:
       How about a more readable condition here?
   
   if (regionCloseFailed || anyOtherExceptionThrown) { // set these flags above
   
   }
   
   I think that makes the flow easier to understand.

##########
File path: 
hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
##########
@@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region,
         }
         return;
       }
+      long sleepTime = 0;
       try {
         // Send CLOSE RPC
-        if (serverManager.sendRegionClose(server, region,
-          versionOfClosingNode, dest, transitionInZK)) {
-          LOG.debug("Sent CLOSE to " + server + " for region " +
-            region.getRegionNameAsString());
+        if (serverManager.sendRegionClose(server, region, 
versionOfClosingNode, dest,
+          transitionInZK)) {
+          LOG.debug("Sent CLOSE to " + server + " for region " + 
region.getRegionNameAsString());
           if (useZKForAssignment && !transitionInZK && state != null) {
             // Retry to make sure the region is
             // closed so as to avoid double assignment.
-            unassign(region, state, versionOfClosingNode,
-              dest, transitionInZK, src);
+            unassign(region, state, versionOfClosingNode, dest, 
transitionInZK, src);
           }
           return;
         }
         // This never happens. Currently regionserver close always return true.
         // Todo; this can now happen (0.96) if there is an exception in a 
coprocessor
-        LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
-          region.getRegionNameAsString());
-      } catch (Throwable t) {
-        long sleepTime = 0;
+        LOG.warn("Server " + server + " region CLOSE RPC returned false for "
+            + region.getRegionNameAsString());
+      } catch (Throwable t) {       
         Configuration conf = this.server.getConfiguration();
         if (t instanceof RemoteException) {
-          t = ((RemoteException)t).unwrapRemoteException();
+          t = ((RemoteException) t).unwrapRemoteException();
         }
         boolean logRetries = true;
-        if (t instanceof RegionServerAbortedException
-            || t instanceof RegionServerStoppedException
+        if (t instanceof RegionServerAbortedException || t instanceof 
RegionServerStoppedException
             || t instanceof ServerNotRunningYetException) {
           // RS is aborting or stopping, we cannot offline the region since 
the region may need
-          // to do WAL recovery. Until we see  the RS expiration, we should 
retry.
+          // to do WAL recovery. Until we see the RS expiration, we should 
retry.
           sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
             RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
 
         } else if (t instanceof NotServingRegionException) {
-          LOG.debug("Offline " + region.getRegionNameAsString()
-            + ", it's not any more on " + server, t);
+          LOG.debug(
+            "Offline " + region.getRegionNameAsString() + ", it's not any more 
on " + server, t);
           if (transitionInZK) {
             deleteClosingOrClosedNode(region, server);
           }
           if (state != null) {
             regionOffline(region);
           }
           return;
-        } else if ((t instanceof FailedServerException) || (state != null &&
-            t instanceof RegionAlreadyInTransitionException)) {
-          if (t instanceof FailedServerException) {
-            sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
+        } else if ((t instanceof FailedServerException)
+            || (state != null && t instanceof 
RegionAlreadyInTransitionException)) {
+              if (t instanceof FailedServerException) {
+                sleepTime = 1L + 
conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
-          } else {
-            // RS is already processing this region, only need to update the 
timestamp
-            LOG.debug("update " + state + " the timestamp.");
-            state.updateTimestampToNow();
-            if (maxWaitTime < 0) {
-              maxWaitTime =
-                  EnvironmentEdgeManager.currentTime()
-                      + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
-                        DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
-            }
-            long now = EnvironmentEdgeManager.currentTime();
-            if (now < maxWaitTime) {
-              LOG.debug("Region is already in transition; "
-                + "waiting up to " + (maxWaitTime - now) + "ms", t);
-              sleepTime = 100;
-              i--; // reset the try count
-              logRetries = false;
+              } else {
+                // RS is already processing this region, only need to update 
the timestamp
+                LOG.debug("update " + state + " the timestamp.");
+                state.updateTimestampToNow();
+                if (maxWaitTime < 0) {
+                  maxWaitTime = EnvironmentEdgeManager.currentTime() + 
conf.getLong(
+                    ALREADY_IN_TRANSITION_WAITTIME, 
DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
+                }
+                long now = EnvironmentEdgeManager.currentTime();
+                if (now < maxWaitTime) {
+                  LOG.debug("Region is already in transition; " + "waiting up 
to "
+                      + (maxWaitTime - now) + "ms",
+                    t);
+                  sleepTime = 100;
+                  i--; // reset the try count
+                  logRetries = false;
+                }
+              }
             }
-          }
-        }
-
-        try {
-          if (sleepTime > 0) {
-            Thread.sleep(sleepTime);
-          }
-        } catch (InterruptedException ie) {
-          LOG.warn("Failed to unassign "
-            + region.getRegionNameAsString() + " since interrupted", ie);
-          Thread.currentThread().interrupt();
-          if (state != null) {
-            regionStates.updateRegionState(region, State.FAILED_CLOSE);
-          }
-          return;
-        }
 
         if (logRetries) {
-          LOG.info("Server " + server + " returned " + t + " for "
-            + region.getRegionNameAsString() + ", try=" + i
-            + " of " + this.maximumAttempts, t);
+          LOG.info("Server " + server + " returned " + t + " for " + 
region.getRegionNameAsString()
+              + ", try=" + i + " of " + this.maximumAttempts,
+            t);
           // Presume retry or server will expire.
         }
       }
+      // If sleepTime is not set by any of the cases, set it to sleep for
+      // configured exponential backoff time
+      if (sleepTime == 0 && i != maximumAttempts) {
+        sleepTime = backoffPolicy.getBackoffTime(retryConfig, i);
+        LOG.info("Waiting for " + sleepTime + "milliseconds exponential 
backoff time for "
+            + region.getRegionNameAsString() + " before next retry " + (i + 1) 
+ " of "
+            + this.maximumAttempts);
+      }
+      try {
+        if (sleepTime > 0 && i != maximumAttempts) {
+          Thread.sleep(sleepTime);
+        }
+      } catch (InterruptedException ie) {
+        LOG.warn("Failed to unassign " + region.getRegionNameAsString() + " 
since interrupted", ie);
+        Thread.currentThread().interrupt();
+        if (state != null) {

Review comment:
       do this before propagating the interrupt flag?

##########
File path: 
hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
##########
@@ -598,6 +601,68 @@ public void testCloseFailed() throws Exception {
     }
   }
 
+  /**
+   * This tests region close with exponential backoff
+   */
+  @Test(timeout = 60000)
+  public void testCloseRegionWithExponentialBackOff() throws Exception {
+    String table = "testCloseRegionWithExponentialBackOff";
+    // Set the backoff time between each retry for failed close
+    
TEST_UTIL.getMiniHBaseCluster().getConf().setLong("hbase.assignment.retry.sleep.initial",
 1000);
+    HMaster activeMaster = TEST_UTIL.getHBaseCluster().getMaster();
+    TEST_UTIL.getMiniHBaseCluster().stopMaster(activeMaster.getServerName());
+    TEST_UTIL.getMiniHBaseCluster().startMaster(); // restart the master for 
conf take into affect
+
+    try {
+      ScheduledThreadPoolExecutor scheduledThreadPoolExecutor =
+          new ScheduledThreadPoolExecutor(1, 
Threads.newDaemonThreadFactory("ExponentialBackOff"));
+
+      HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table));
+      desc.addFamily(new HColumnDescriptor(FAMILY));
+      admin.createTable(desc);
+
+      Table meta = new HTable(conf, TableName.META_TABLE_NAME);
+      HRegionInfo hri =
+          new HRegionInfo(desc.getTableName(), Bytes.toBytes("A"), 
Bytes.toBytes("Z"));
+      MetaTableAccessor.addRegionToMeta(meta, hri);
+
+      HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+      AssignmentManager am = master.getAssignmentManager();
+      assertTrue(TEST_UTIL.assignRegion(hri));
+      ServerName sn = am.getRegionStates().getRegionServerOfRegion(hri);
+      TEST_UTIL.assertRegionOnServer(hri, sn, 6000);
+
+      MyRegionObserver.preCloseEnabled.set(true);
+      // Unset the precloseEnabled flag after 1 second for next retry to 
succeed
+      scheduledThreadPoolExecutor.schedule(new Runnable() {
+        @Override
+        public void run() {
+          MyRegionObserver.preCloseEnabled.set(false);

Review comment:
       Does the test reliably fail without the patch, meaning it ends up in 
FAILED_CLOSE?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [hbase] bharathv commented on a change in pull request #1755: HBASE-24069 Provide an ExponentialBackOffPolicy sleep between failed …

Reply via email to