dajac commented on code in PR #13638: URL: https://github.com/apache/kafka/pull/13638#discussion_r1183280551
########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/consumer/CurrentAssignmentBuilder.java: ########## @@ -0,0 +1,415 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.consumer; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.common.message.ConsumerGroupHeartbeatRequestData; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiFunction; + +/** + * The CurrentAssignmentBuilder class encapsulates the reconciliation engine of the + * consumer group protocol. Given the current state of a member and a desired or target + * assignment state, the state machine takes the necessary steps to converge them. + * + * The member state has the following properties: + * - Current Epoch - The current epoch of the member. + * - Next Epoch - The desired epoch of the member. It corresponds to the epoch of + * the target/desired assignment. 
The member transitions to this epoch + * when it has revoked the partitions that it does not own or if it + * does not have to revoke any. + * - Previous Epoch - The previous epoch of the member when the state was updated. + * - Assigned Set - The set of partitions currently assigned to the member. This represents what + * the member should have. + * - Revoking Set - The set of partitions that the member should revoke before it can transition + * to the next state. + * - Assigning Set - The set of partitions that the member will eventually receive. The partitions + * in this set are still owned by other members in the group. + * + * The state machine has four states: + * - NEW_TARGET_ASSIGNMENT - This is the initial state of the state machine. The state machine starts + * here when the next epoch does not match the target epoch. It means that + * a new target assignment has been installed so the reconciliation process + * must restart. In this state, the Assigned, Revoking and Assigning sets + * are computed. If Revoking is not empty, the state machine transitions + * to REVOKE; if Assigning is not empty, it transitions to ASSIGNING; + * otherwise it transitions to STABLE. + * - REVOKE - This state means that the member must revoke partitions before it can + * transition to the next epoch and thus start receiving new partitions. + * The member transitions to the next state only when it has acknowledged + * the revocation. + * - ASSIGNING - This state means that the member waits on partitions which are still + * owned by other members in the group. It remains in this state until + * they are all freed up. + * - STABLE - This state means that the member has received all its assigned partitions. + */ +public class CurrentAssignmentBuilder { + /** + * The consumer group member which is reconciled. + */ + private final ConsumerGroupMember member; + + /** + * The target assignment epoch. + */ + private int targetAssignmentEpoch; + + /** + * The target assignment. 
+ */ + private Assignment targetAssignment; + + /** + * A function which returns the current epoch of a topic-partition or -1 if the + * topic-partition is not assigned. The current epoch is the epoch of the current owner. + */ + private BiFunction<Uuid, Integer, Integer> currentPartitionEpoch; + + /** + * The partitions owned by the consumer. This is directly provided by the member in the + * ConsumerGroupHeartbeat request. + */ + private List<ConsumerGroupHeartbeatRequestData.TopicPartitions> ownedTopicPartitions; + + /** + * Constructs the CurrentAssignmentBuilder based on the current state of the + * provided consumer group member. + * + * @param member The consumer group member that must be reconciled. + */ + public CurrentAssignmentBuilder(ConsumerGroupMember member) { + this.member = Objects.requireNonNull(member); + } + + /** + * Sets the target assignment epoch and the target assignment that the + * consumer group member must be reconciled to. + * + * @param targetAssignmentEpoch The target assignment epoch. + * @param targetAssignment The target assignment. + * @return This object. + */ + public CurrentAssignmentBuilder withTargetAssignment( + int targetAssignmentEpoch, + Assignment targetAssignment + ) { + this.targetAssignmentEpoch = targetAssignmentEpoch; + this.targetAssignment = Objects.requireNonNull(targetAssignment); + return this; + } + + /** + * Sets a BiFunction which allows to retrieve the current epoch of a + * partition. This is used by the state machine to determine if a + * partition is free or still used by another member. + * + * @param currentPartitionEpoch A BiFunction which gets the epoch of a + * topic id / partitions id pair. + * @return This object. 
+ */ + public CurrentAssignmentBuilder withCurrentPartitionEpoch( + BiFunction<Uuid, Integer, Integer> currentPartitionEpoch + ) { + this.currentPartitionEpoch = Objects.requireNonNull(currentPartitionEpoch); + return this; + } + + /** + * Sets the partitions currently owned by the member. This comes directly + * from the last ConsumerGroupHeartbeat request. This is used to determine + * if the member has revoked the necessary partitions. + * + * @param ownedTopicPartitions A list of topic-partitions. + * @return This object. + */ + public CurrentAssignmentBuilder withOwnedTopicPartitions( + List<ConsumerGroupHeartbeatRequestData.TopicPartitions> ownedTopicPartitions + ) { + this.ownedTopicPartitions = ownedTopicPartitions; + return this; + } + + /** + * Builds the next state for the member or keep the current one if it + * is not possible to move forward with the current state. + * + * @return A new ConsumerGroupMember or the current one. + */ + public ConsumerGroupMember build() { + // A new target assignment has been installed, we need to restart + // the reconciliation loop from the beginning. + if (targetAssignmentEpoch != member.nextMemberEpoch()) { + return transitionToInitialState(); + } + + switch (member.state()) { + // Check if the partitions have been revoked by the member. + case REVOKING: + return maybeTransitionFromRevokingToAssigningOrStable(); + + // Check if pending partitions have been freed up. + case ASSIGNING: + return maybeTransitionFromAssigningToAssigningOrStable(); + + // Nothing to do. + case STABLE: + return member; + } + + return member; + } + + /** + * Transitions to the initial state. Here we compute the Assigned, + * Revoking and Assigning sets. + * + * @return A new ConsumerGroupMember. 
+ */ + private ConsumerGroupMember transitionToInitialState() { + Map<Uuid, Set<Integer>> newAssignedSet = new HashMap<>(); + Map<Uuid, Set<Integer>> newRevokingSet = new HashMap<>(); + Map<Uuid, Set<Integer>> newAssigningSet = new HashMap<>(); + + // Compute the combined set of topics. + Set<Uuid> allTopicIds = new HashSet<>(targetAssignment.partitions().keySet()); + allTopicIds.addAll(member.assignedPartitions().keySet()); + allTopicIds.addAll(member.partitionsPendingRevocation().keySet()); + allTopicIds.addAll(member.partitionsPendingAssignment().keySet()); + + for (Uuid topicId : allTopicIds) { + Set<Integer> target = targetAssignment.partitions() + .getOrDefault(topicId, Collections.emptySet()); + Set<Integer> currentAssignedPartitions = member.assignedPartitions() + .getOrDefault(topicId, Collections.emptySet()); + Set<Integer> currentRevokingPartitions = member.partitionsPendingRevocation() + .getOrDefault(topicId, Collections.emptySet()); + + // Assigned_1 = (Assigned_0 + Revoking_0) /\ Target + Set<Integer> newAssignedPartitions = new HashSet<>(currentAssignedPartitions); + newAssignedPartitions.addAll(currentRevokingPartitions); + newAssignedPartitions.retainAll(target); + + // Revoking_1 = (Assigned_0 + Revoking_0) - Assigned_1 + Set<Integer> newRevokingPartitions = new HashSet<>(currentAssignedPartitions); + newRevokingPartitions.addAll(currentRevokingPartitions); + newRevokingPartitions.removeAll(newAssignedPartitions); + + // Assigning_1 = Target - Assigned_1 + Set<Integer> newAssigningPartitions = new HashSet<>(target); + newAssigningPartitions.removeAll(newAssignedPartitions); + + if (!newAssignedPartitions.isEmpty()) { + newAssignedSet.put(topicId, newAssignedPartitions); + } + + if (!newRevokingPartitions.isEmpty()) { + newRevokingSet.put(topicId, newRevokingPartitions); + } + + if (!newAssigningPartitions.isEmpty()) { + newAssigningSet.put(topicId, newAssigningPartitions); + } + } + + if (!newRevokingSet.isEmpty()) { + // If the revoking set is 
not empty, we transition to Revoking and we + // stay in the current epoch. + return new ConsumerGroupMember.Builder(member) + .setAssignedPartitions(newAssignedSet) + .setPartitionsPendingRevocation(newRevokingSet) + .setPartitionsPendingAssignment(newAssigningSet) + .setNextMemberEpoch(targetAssignmentEpoch) + .build(); + } else { + if (!newAssigningSet.isEmpty()) { + // If the assigning set is not empty, we check if some or all + // partitions are free to use. If they are, we move them to + // the assigned set. + maybeAssignPendingPartitions(newAssignedSet, newAssigningSet); + } + + // We transition to the target epoch. If the assigning set is empty, + // the member transition to stable, otherwise to assigning. + return new ConsumerGroupMember.Builder(member) + .setAssignedPartitions(newAssignedSet) + .setPartitionsPendingRevocation(Collections.emptyMap()) + .setPartitionsPendingAssignment(newAssigningSet) + .setPreviousMemberEpoch(member.memberEpoch()) + .setMemberEpoch(targetAssignmentEpoch) + .setNextMemberEpoch(targetAssignmentEpoch) + .build(); + } + } + + /** + * Tries to transition from Revoke to Assigning or Stable. This is only + * possible when the member acknowledges that it only owns the partition + * in the Assigned set. + * + * @return A new ConsumerGroupMember with the new state or the current one + * if the member stays in the current state. + */ + private ConsumerGroupMember maybeTransitionFromRevokingToAssigningOrStable() { + if (member.partitionsPendingRevocation().isEmpty() || hasRevokedAllPartitions(ownedTopicPartitions)) { + Map<Uuid, Set<Integer>> newAssignedSet = deepCopy(member.assignedPartitions()); + Map<Uuid, Set<Integer>> newAssigningSet = deepCopy(member.partitionsPendingAssignment()); + + if (!newAssigningSet.isEmpty()) { + // If the assigning set is not empty, we check if some or all + // partitions are free to use. If they are, we move them to + // the assigned set. 
+ maybeAssignPendingPartitions(newAssignedSet, newAssigningSet); + } + + // We transition to the target epoch. If the assigning set is empty, + // the member transition to stable, otherwise to assigning. + return new ConsumerGroupMember.Builder(member) + .setAssignedPartitions(newAssignedSet) + .setPartitionsPendingRevocation(Collections.emptyMap()) + .setPartitionsPendingAssignment(newAssigningSet) + .setPreviousMemberEpoch(member.memberEpoch()) + .setMemberEpoch(targetAssignmentEpoch) + .setNextMemberEpoch(targetAssignmentEpoch) + .build(); + } else { + return member; + } + } + + /** + * Tries to transition from Assigning to Assigning or Stable. This is only + * possible when one or more partitions in the Assigning set have been freed + * up by other members in the group. + * + * @return A new ConsumerGroupMember with the new state or the current one + * if the member stays in the current state. + */ + private ConsumerGroupMember maybeTransitionFromAssigningToAssigningOrStable() { + Map<Uuid, Set<Integer>> newAssignedSet = deepCopy(member.assignedPartitions()); Review Comment: Which part are you referring to? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org