soarez commented on code in PR #14903:
URL: https://github.com/apache/kafka/pull/14903#discussion_r1414633539


##########
core/src/main/scala/kafka/server/BrokerLifecycleManager.scala:
##########
@@ -453,79 +490,73 @@ class BrokerLifecycleManager(
         val message = response.responseBody().asInstanceOf[BrokerHeartbeatResponse]
         val errorCode = Errors.forCode(message.data().errorCode())
         if (errorCode == Errors.NONE) {
-          // this response handler is not invoked from the event handler thread,
-          // and processing a successful heartbeat response requires updating
-          // state, so to continue we need to schedule an event
-          eventQueue.prepend(new BrokerHeartbeatResponseEvent(message.data()))
+          val responseData = message.data()
+          failedAttempts = 0
+          _state match {
+            case BrokerState.STARTING =>
+              if (responseData.isCaughtUp) {
+                info(s"The broker has caught up. Transitioning from STARTING to RECOVERY.")
+                _state = BrokerState.RECOVERY
+                initialCatchUpFuture.complete(null)
+              } else {
+                debug(s"The broker is STARTING. Still waiting to catch up with cluster metadata.")
+              }
+              // Schedule the heartbeat after only 10 ms so that in the case where
+              // there is no recovery work to be done, we start up a bit quicker.
+              scheduleNextCommunication(NANOSECONDS.convert(10, MILLISECONDS))
+            case BrokerState.RECOVERY =>
+              if (!responseData.isFenced) {
+                info(s"The broker has been unfenced. Transitioning from RECOVERY to RUNNING.")
+                initialUnfenceFuture.complete(null)
+                _state = BrokerState.RUNNING
+              } else {
+                info(s"The broker is in RECOVERY.")
+              }
+              scheduleNextCommunicationAfterSuccess()
+            case BrokerState.RUNNING =>
+              debug(s"The broker is RUNNING. Processing heartbeat response.")
+              scheduleNextCommunicationAfterSuccess()
+            case BrokerState.PENDING_CONTROLLED_SHUTDOWN =>
+              if (!responseData.shouldShutDown()) {
+                info(s"The broker is in PENDING_CONTROLLED_SHUTDOWN state, still waiting " +
+                  "for the active controller.")
+                if (!gotControlledShutdownResponse) {
+                  // If this is the first pending controlled shutdown response we got,
+                  // schedule our next heartbeat a little bit sooner than we usually would.
+                  // In the case where controlled shutdown completes quickly, this will
+                  // speed things up a little bit.
+                  scheduleNextCommunication(NANOSECONDS.convert(50, MILLISECONDS))
+                } else {
+                  scheduleNextCommunicationAfterSuccess()
+                }
+              } else {
+                info(s"The controller has asked us to exit controlled shutdown.")
+                beginShutdown()
+              }
+              gotControlledShutdownResponse = true
+            case BrokerState.SHUTTING_DOWN =>
+              info(s"The broker is SHUTTING_DOWN. Ignoring heartbeat response.")
+            case _ =>
+              error(s"Unexpected broker state ${_state}")
+              scheduleNextCommunicationAfterSuccess()
+          }
         } else {
           warn(s"Broker $nodeId sent a heartbeat request but received error $errorCode.")
           scheduleNextCommunicationAfterFailure()
         }
       }
     }
-
-    override def onTimeout(): Unit = {
-      info("Unable to send a heartbeat because the RPC got timed out before it could be sent.")
-      scheduleNextCommunicationAfterFailure()
-    }
   }
 
-  private class BrokerHeartbeatResponseEvent(response: BrokerHeartbeatResponseData) extends EventQueue.Event {
-    override def run(): Unit = {
-      failedAttempts = 0
-      _state match {
-        case BrokerState.STARTING =>
-          if (response.isCaughtUp) {
-            info(s"The broker has caught up. Transitioning from STARTING to RECOVERY.")
-            _state = BrokerState.RECOVERY
-            initialCatchUpFuture.complete(null)
-          } else {
-            debug(s"The broker is STARTING. Still waiting to catch up with cluster metadata.")
-          }
-          // Schedule the heartbeat after only 10 ms so that in the case where
-          // there is no recovery work to be done, we start up a bit quicker.
-          scheduleNextCommunication(NANOSECONDS.convert(10, MILLISECONDS))
-        case BrokerState.RECOVERY =>
-          if (!response.isFenced) {
-            info(s"The broker has been unfenced. Transitioning from RECOVERY to RUNNING.")
-            initialUnfenceFuture.complete(null)
-            _state = BrokerState.RUNNING
-          } else {
-            info(s"The broker is in RECOVERY.")
-          }
-          scheduleNextCommunicationAfterSuccess()
-        case BrokerState.RUNNING =>
-          debug(s"The broker is RUNNING. Processing heartbeat response.")
-          scheduleNextCommunicationAfterSuccess()
-        case BrokerState.PENDING_CONTROLLED_SHUTDOWN =>
-          if (!response.shouldShutDown()) {
-            info(s"The broker is in PENDING_CONTROLLED_SHUTDOWN state, still waiting " +
-              "for the active controller.")
-            if (!gotControlledShutdownResponse) {
-              // If this is the first pending controlled shutdown response we got,
-              // schedule our next heartbeat a little bit sooner than we usually would.
-              // In the case where controlled shutdown completes quickly, this will
-              // speed things up a little bit.
-              scheduleNextCommunication(NANOSECONDS.convert(50, MILLISECONDS))
-            } else {
-              scheduleNextCommunicationAfterSuccess()
-            }
-          } else {
-            info(s"The controller has asked us to exit controlled shutdown.")
-            beginShutdown()
-          }
-          gotControlledShutdownResponse = true
-        case BrokerState.SHUTTING_DOWN =>
-          info(s"The broker is SHUTTING_DOWN. Ignoring heartbeat response.")
-        case _ =>
-          error(s"Unexpected broker state ${_state}")
-          scheduleNextCommunicationAfterSuccess()
-      }
+  private def scheduleNextCommunicationImmediately(): Unit = {
+    if (communicationInFlight) {

Review Comment:
   You are correct, thanks for finding this. I placed the `communicationInFlight` check in `scheduleNextCommunicationImmediately`, but `BrokerHeartbeatResponseEvent` bypasses that by calling `scheduleNextCommunication` directly.
   
   I'm moving the check to `scheduleNextCommunication` so we catch all cases. Since `CommunicationEvent` is only scheduled from this function, we can make sure it is never scheduled while a request is in flight.
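   
   Roughly, the guarded method could look like the sketch below (assuming this PR's `communicationInFlight` flag and the class's existing `eventQueue`, `DeadlineFunction`, and `CommunicationEvent` scheduling pieces; the exact log messages are illustrative):
   
   ```scala
   private def scheduleNextCommunication(intervalNs: Long): Unit = {
     if (communicationInFlight) {
       // A heartbeat or registration request is still outstanding. Its response
       // handler will schedule the next CommunicationEvent, so scheduling one
       // here would queue a duplicate request.
       trace("Not scheduling communication: a request is already in flight.")
     } else {
       trace(s"Scheduling next communication at ${MILLISECONDS.convert(intervalNs, NANOSECONDS)} ms from now.")
       val deadlineNs = time.nanoseconds() + intervalNs
       eventQueue.scheduleDeferred("communication",
         new DeadlineFunction(deadlineNs),
         new CommunicationEvent())
     }
   }
   ```
   
   With the guard here, callers such as `scheduleNextCommunicationAfterSuccess()` and `scheduleNextCommunicationImmediately()` shouldn't need their own checks, since they all funnel through this method.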


