lianetm commented on code in PR #14364: URL: https://github.com/apache/kafka/pull/14364#discussion_r1335304701
########## clients/src/main/java/org/apache/kafka/clients/consumer/internals/HeartbeatRequestManager.java: ########## @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.clients.consumer.internals; + +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.errors.GroupAuthorizationException; +import org.apache.kafka.common.message.ConsumerGroupHeartbeatRequestData; +import org.apache.kafka.common.protocol.Errors; +import org.apache.kafka.common.requests.ConsumerGroupHeartbeatRequest; +import org.apache.kafka.common.requests.ConsumerGroupHeartbeatResponse; +import org.apache.kafka.common.utils.LogContext; +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.common.utils.Timer; +import org.slf4j.Logger; + +import java.util.ArrayList; +import java.util.Collections; + +/** + * Manages the request creation and response handling for the heartbeat. The module creates a {@link ConsumerGroupHeartbeatRequest} + * using the state stored in the {@link MembershipManager} and enqueue it to the network queue to be sent out. 
Once + * the response is received, the module will update the state in the {@link MembershipManager} and handle any errors. + * + * The manager only emits a heartbeat when the member is in a group, tries to join a group, or tries to rejoin the group. + * If the member does not have a groupId configured, has left the group, or has encountered a fatal exception, the heartbeat will + * not be sent. If the coordinator is not found, we will skip sending the heartbeat and try to find a coordinator first. + * + * If the heartbeat fails due to a retriable error, such as TimeoutException, the subsequent attempts will back off + * exponentially. + * + * If the member completes the partition revocation process, a heartbeat request will be sent in the next event loop. + * + * See {@link HeartbeatRequestState} for more details. + */ +public class HeartbeatRequestManager implements RequestManager { + private final Logger logger; + + private final int rebalanceTimeoutMs; + + private final CoordinatorRequestManager coordinatorRequestManager; + private final SubscriptionState subscriptions; + private final HeartbeatRequestState heartbeatRequestState; + private final MembershipManager membershipManager; + private final ErrorEventHandler nonRetriableErrorHandler; + + public HeartbeatRequestManager( + final Time time, + final LogContext logContext, + final ConsumerConfig config, + final CoordinatorRequestManager coordinatorRequestManager, + final SubscriptionState subscriptions, + final MembershipManager membershipManager, + final ErrorEventHandler nonRetriableErrorHandler) { + this.coordinatorRequestManager = coordinatorRequestManager; + this.logger = logContext.logger(getClass()); + this.subscriptions = subscriptions; + this.membershipManager = membershipManager; + this.nonRetriableErrorHandler = nonRetriableErrorHandler; + this.rebalanceTimeoutMs = config.getInt(CommonClientConfigs.MAX_POLL_INTERVAL_MS_CONFIG); + long retryBackoffMs = config.getLong(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG); + 
long retryBackoffMaxMs = config.getLong(ConsumerConfig.RETRY_BACKOFF_MAX_MS_CONFIG); + this.heartbeatRequestState = new HeartbeatRequestState(logContext, time, 0, retryBackoffMs, + retryBackoffMaxMs, rebalanceTimeoutMs); + } + + // Visible for testing + HeartbeatRequestManager( + final Time time, + final LogContext logContext, + final ConsumerConfig config, + final CoordinatorRequestManager coordinatorRequestManager, + final SubscriptionState subscriptions, + final MembershipManager membershipManager, + final HeartbeatRequestState heartbeatRequestState, + final ErrorEventHandler nonRetriableErrorHandler) { + this.logger = logContext.logger(this.getClass()); + this.subscriptions = subscriptions; + this.rebalanceTimeoutMs = config.getInt(CommonClientConfigs.MAX_POLL_INTERVAL_MS_CONFIG); + this.coordinatorRequestManager = coordinatorRequestManager; + this.heartbeatRequestState = heartbeatRequestState; + this.membershipManager = membershipManager; + this.nonRetriableErrorHandler = nonRetriableErrorHandler; + } + + @Override + public NetworkClientDelegate.PollResult poll(long currentTimeMs) { + if (!coordinatorRequestManager.coordinator().isPresent() || !membershipManager.shouldSendHeartbeat()) { + return new NetworkClientDelegate.PollResult( + Long.MAX_VALUE, Collections.emptyList()); + } + + // TODO: We will need to send a heartbeat response after partitions are revoked. This needs to be + // implemented either with or after the partition reconciliation logic. 
+ if (!heartbeatRequestState.canSendRequest(currentTimeMs)) { + return new NetworkClientDelegate.PollResult( + heartbeatRequestState.nextHeartbeatMs(currentTimeMs), + Collections.emptyList()); + } + this.heartbeatRequestState.onSendAttempt(currentTimeMs); + NetworkClientDelegate.UnsentRequest request = makeHeartbeatRequest(); + // return Long.MAX_VALUE because we will update the timer when the response is received + return new NetworkClientDelegate.PollResult(Long.MAX_VALUE, Collections.singletonList(request)); + } + + private NetworkClientDelegate.UnsentRequest makeHeartbeatRequest() { + ConsumerGroupHeartbeatRequestData data = new ConsumerGroupHeartbeatRequestData() + .setGroupId(membershipManager.groupId()) + .setMemberEpoch(membershipManager.memberEpoch()) + .setMemberId(membershipManager.memberId()) + .setRebalanceTimeoutMs(rebalanceTimeoutMs); + + membershipManager.groupInstanceId().ifPresent(data::setInstanceId); + + if (this.subscriptions.hasPatternSubscription()) { + // We haven't discussed how Regex is stored in the consumer. We could do it in the subscriptionState, + // in the memberStateManager, or here. 
+ // data.setSubscribedTopicRegex(regex) + } else { + data.setSubscribedTopicNames(new ArrayList<>(this.subscriptions.subscription())); + } + + this.membershipManager.assignorSelection().serverAssignor().ifPresent(data::setServerAssignor); + + NetworkClientDelegate.UnsentRequest request = new NetworkClientDelegate.UnsentRequest( + new ConsumerGroupHeartbeatRequest.Builder(data), + coordinatorRequestManager.coordinator()); + + request.future().whenComplete((response, exception) -> { + if (exception == null) { + onResponse((ConsumerGroupHeartbeatResponse) response.responseBody(), response.receivedTimeMs()); + } else { + onFailure(exception, response.receivedTimeMs()); + } + }); + return request; + } + + private void onFailure(final Throwable exception, final long responseTimeMs) { + this.heartbeatRequestState.onFailedAttempt(responseTimeMs); + logger.warn("Failed to send heartbeat to coordinator node {} due to error: {}", + coordinatorRequestManager.coordinator(), exception.getMessage()); + } + + private void onResponse(final ConsumerGroupHeartbeatResponse response, long currentTimeMs) { + if (Errors.forCode(response.data().errorCode()) == Errors.NONE) { + this.heartbeatRequestState.updateHeartbeatIntervalMs(response.data().heartbeatIntervalMs()); + this.heartbeatRequestState.onSuccessfulAttempt(currentTimeMs); + this.heartbeatRequestState.resetTimer(); + try { + membershipManager.updateState(response.data()); + } catch (KafkaException e) { + logger.error("Received unexpected error in heartbeat response: {}", e.getMessage()); + } + return; + } + + onErrorResponse(response, currentTimeMs); + } + + private void onErrorResponse(final ConsumerGroupHeartbeatResponse response, + final long currentTimeMs) { + this.heartbeatRequestState.onFailedAttempt(currentTimeMs); + Errors error = Errors.forCode(response.data().errorCode()); + if (error == Errors.NOT_COORDINATOR || error == Errors.COORDINATOR_NOT_AVAILABLE) { + String errorMessage = String.format("Coordinator node {} is 
either not started or not valid. Retrying", + coordinatorRequestManager); + logInfo(errorMessage, response, currentTimeMs); + coordinatorRequestManager.markCoordinatorUnknown(response.data().errorMessage(), currentTimeMs); + } else if (error == Errors.COORDINATOR_LOAD_IN_PROGRESS) { + // retry + String errorMessage = String.format("Heartbeat was not successful because the coordinator node {} is loading. Retrying", + coordinatorRequestManager.coordinator()); + logInfo(errorMessage, response, currentTimeMs); + } else { + onFatalErrorResponse(response); + } + } + + private void onFatalErrorResponse(final ConsumerGroupHeartbeatResponse response) { + final Errors responseError = Errors.forCode(response.data().errorCode()); + if (responseError == Errors.GROUP_AUTHORIZATION_FAILED) { Review Comment: This is a non-retriable exception, so I expect we should be calling `membershipManager.transitionToFailure()`? (same for all other fatal exceptions up to the `UNRELEASED_INSTANCE_ID`, which is properly doing the transition) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org