Re: [PR] KAFKA-15784: Ensure atomicity of in memory update and write when transactionally committing offsets [kafka]

via GitHub Mon, 04 Dec 2023 16:01:01 -0800


artemlivshits commented on code in PR #14774:
URL: https://github.com/apache/kafka/pull/14774#discussion_r1414599997



##########
core/src/main/scala/kafka/server/KafkaApis.scala:
##########
@@ -708,23 +708,40 @@ class KafkaApis(val requestChannel: RequestChannel,
       }
     }
 
-    if (authorizedRequestInfo.isEmpty)
-      sendResponseCallback(Map.empty)
-    else {
-      val internalTopicsAllowed = request.header.clientId == 
AdminUtils.ADMIN_CLIENT_ID
+    val internalTopicsAllowed = request.header.clientId == 
AdminUtils.ADMIN_CLIENT_ID
+    val transactionVerificationEntries = new 
ReplicaManager.TransactionVerificationEntries
 
-      // call the replica manager to append messages to the replicas
+    def postVerificationCallback(newRequestLocal: RequestLocal)

Review Comment:
   Do we need to change this file?  I think we could preserve the abstraction 
and keep the verification guts encapsulated in the async call (as it is now), 
and only let the group coordinator use the explicit stages.
   
   I.e., replicaManager.appendRecords would keep doing what it does now (calling 
the stages under the covers), so we don't have to bring the complexity into 
code that already works well with the abstraction.



##########
core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala:
##########
@@ -349,146 +468,68 @@ class GroupMetadataManager(brokerId: Int,
                    consumerId: String,
                    offsetMetadata: immutable.Map[TopicIdPartition, 
OffsetAndMetadata],
                    responseCallback: immutable.Map[TopicIdPartition, Errors] 
=> Unit,
-                   transactionalId: String = null,
                    producerId: Long = RecordBatch.NO_PRODUCER_ID,
                    producerEpoch: Short = RecordBatch.NO_PRODUCER_EPOCH,
                    requestLocal: RequestLocal = RequestLocal.NoCaching): Unit 
= {
-    // first filter out partitions with offset metadata size exceeding limit
-    val filteredOffsetMetadata = offsetMetadata.filter { case (_, 
offsetAndMetadata) =>
-      validateOffsetMetadataLength(offsetAndMetadata.metadata)
-    }
-
     group.inLock {
       if (!group.hasReceivedConsistentOffsetCommits)
         warn(s"group: ${group.groupId} with leader: ${group.leaderOrNull} has 
received offset commits from consumers as well " +
           s"as transactional producers. Mixing both types of offset commits 
will generally result in surprises and " +
           s"should be avoided.")
     }
 
-    val isTxnOffsetCommit = producerId != RecordBatch.NO_PRODUCER_ID
-    // construct the message set to append
+    val filteredOffsetMetadata = offsetMetadata.filter { case (_, 
offsetAndMetadata) =>
+      validateOffsetMetadataLength(offsetAndMetadata.metadata)
+    }
     if (filteredOffsetMetadata.isEmpty) {
       // compute the final error codes for the commit response
       val commitStatus = offsetMetadata.map { case (k, _) => k -> 
Errors.OFFSET_METADATA_TOO_LARGE }
       responseCallback(commitStatus)
-    } else {
-      getMagic(partitionFor(group.groupId)) match {
-        case Some(magicValue) =>
-          // We always use CREATE_TIME, like the producer. The conversion to 
LOG_APPEND_TIME (if necessary) happens automatically.
-          val timestampType = TimestampType.CREATE_TIME
-          val timestamp = time.milliseconds()
-
-          val records = filteredOffsetMetadata.map { case (topicIdPartition, 
offsetAndMetadata) =>
-            val key = GroupMetadataManager.offsetCommitKey(group.groupId, 
topicIdPartition.topicPartition)
-            val value = 
GroupMetadataManager.offsetCommitValue(offsetAndMetadata, 
interBrokerProtocolVersion)
-            new SimpleRecord(timestamp, key, value)
-          }
-          val offsetTopicPartition = new 
TopicPartition(Topic.GROUP_METADATA_TOPIC_NAME, partitionFor(group.groupId))
-          val buffer = 
ByteBuffer.allocate(AbstractRecords.estimateSizeInBytes(magicValue, 
compressionType, records.asJava))
-
-          if (isTxnOffsetCommit && magicValue < RecordBatch.MAGIC_VALUE_V2)
-            throw Errors.UNSUPPORTED_FOR_MESSAGE_FORMAT.exception("Attempting 
to make a transaction offset commit with an invalid magic: " + magicValue)
-
-          val builder = MemoryRecords.builder(buffer, magicValue, 
compressionType, timestampType, 0L, time.milliseconds(),
-            producerId, producerEpoch, 0, isTxnOffsetCommit, 
RecordBatch.NO_PARTITION_LEADER_EPOCH)
-
-          records.foreach(builder.append)
-          val entries = Map(offsetTopicPartition -> builder.build())
-
-          // set the callback function to insert offsets into cache after log 
append completed
-          def putCacheCallback(responseStatus: Map[TopicPartition, 
PartitionResponse]): Unit = {
-            // the append response should only contain the topics partition
-            if (responseStatus.size != 1 || 
!responseStatus.contains(offsetTopicPartition))
-              throw new IllegalStateException("Append status %s should only 
have one partition %s"
-                .format(responseStatus, offsetTopicPartition))
-
-            // construct the commit response status and insert
-            // the offset and metadata to cache if the append status has no 
error
-            val status = responseStatus(offsetTopicPartition)
-
-            val responseError = group.inLock {
-              if (status.error == Errors.NONE) {
-                if (!group.is(Dead)) {
-                  filteredOffsetMetadata.forKeyValue { (topicIdPartition, 
offsetAndMetadata) =>
-                    if (isTxnOffsetCommit)
-                      group.onTxnOffsetCommitAppend(producerId, 
topicIdPartition, CommitRecordMetadataAndOffset(Some(status.baseOffset), 
offsetAndMetadata))
-                    else
-                      group.onOffsetCommitAppend(topicIdPartition, 
CommitRecordMetadataAndOffset(Some(status.baseOffset), offsetAndMetadata))
-                  }
-                }
-
-                // Record the number of offsets committed to the log
-                offsetCommitsSensor.record(records.size)
-
-                Errors.NONE
-              } else {
-                if (!group.is(Dead)) {
-                  if (!group.hasPendingOffsetCommitsFromProducer(producerId))
-                    removeProducerGroup(producerId, group.groupId)
-                  filteredOffsetMetadata.forKeyValue { (topicIdPartition, 
offsetAndMetadata) =>
-                    if (isTxnOffsetCommit)
-                      group.failPendingTxnOffsetCommit(producerId, 
topicIdPartition)
-                    else
-                      group.failPendingOffsetWrite(topicIdPartition, 
offsetAndMetadata)
-                  }
-                }
-
-                debug(s"Offset commit $filteredOffsetMetadata from group 
${group.groupId}, consumer $consumerId " +
-                  s"with generation ${group.generationId} failed when 
appending to log due to ${status.error.exceptionName}")
-
-                // transform the log append error code to the corresponding 
the commit status error code
-                status.error match {
-                  case Errors.UNKNOWN_TOPIC_OR_PARTITION
-                       | Errors.NOT_ENOUGH_REPLICAS
-                       | Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND =>
-                    Errors.COORDINATOR_NOT_AVAILABLE
-
-                  case Errors.NOT_LEADER_OR_FOLLOWER
-                       | Errors.KAFKA_STORAGE_ERROR =>
-                    Errors.NOT_COORDINATOR
-
-                  case Errors.MESSAGE_TOO_LARGE
-                       | Errors.RECORD_LIST_TOO_LARGE
-                       | Errors.INVALID_FETCH_SIZE =>
-                    Errors.INVALID_COMMIT_OFFSET_SIZE
+      return
+    }
 
-                  case other => other
-                }
-              }
-            }
+    val magicOpt = getMagic(partitionFor(group.groupId))
+    if (magicOpt.isEmpty) {
+      val commitStatus = offsetMetadata.map { case (topicIdPartition, _) =>
+        (topicIdPartition, Errors.NOT_COORDINATOR)
+      }
+      responseCallback(commitStatus)
+      return
+    }
 
-            // compute the final error codes for the commit response
-            val commitStatus = offsetMetadata.map { case (topicIdPartition, 
offsetAndMetadata) =>
-              if (validateOffsetMetadataLength(offsetAndMetadata.metadata))
-                (topicIdPartition, responseError)
-              else
-                (topicIdPartition, Errors.OFFSET_METADATA_TOO_LARGE)
-            }
+    val isTxnOffsetCommit = producerId != RecordBatch.NO_PRODUCER_ID
+    val records = generateOffsetRecords(magicOpt.get, isTxnOffsetCommit, 
group.groupId, filteredOffsetMetadata, producerId, producerEpoch)
+    val putCacheCallback = createPutCacheCallback(isTxnOffsetCommit, group, 
consumerId, offsetMetadata, filteredOffsetMetadata, responseCallback, 
producerId, records)
 
-            // finally trigger the callback logic passed from the API layer
-            responseCallback(commitStatus)
-          }
+    group.inLock {
+        group.prepareOffsetCommit(offsetMetadata)
+    }
 
-          if (isTxnOffsetCommit) {
-            group.inLock {
-              addProducerGroup(producerId, group.groupId)
-              group.prepareTxnOffsetCommit(producerId, offsetMetadata)
-            }
-          } else {
-            group.inLock {
-              group.prepareOffsetCommit(offsetMetadata)
-            }
-          }
+    appendForGroup(group, records, requestLocal, putCacheCallback)
+  }
 
-          appendForGroup(group, entries, requestLocal, putCacheCallback, 
transactionalId)
+  def storeOffsetsAfterVerification(group: GroupMetadata,
+                                    verifiedOffsetMetadata: 
immutable.Map[TopicIdPartition, OffsetAndMetadata],
+                                    records: Map[TopicPartition, 
MemoryRecords],
+                                    putCacheCallback:  Map[TopicPartition, 
PartitionResponse] => Unit,
+                                    producerId: Long,
+                                    transactionVerificationEntries: 
TransactionVerificationEntries,
+                                    errorResults: Map[TopicPartition, 
LogAppendResult],
+                                    requestLocal: RequestLocal = 
RequestLocal.NoCaching): Unit = {
+    group.inLock {

Review Comment:
   Wouldn't this whole function have to be called under the group lock in order 
for the append to happen partially under the lock (sort of the reason for this 
change)?  If so, let's add a comment that explains the precondition and remove 
the extra lock acquisition.  Re-taking locks when it isn't necessary makes it 
hard to track lock scopes.



##########
core/src/main/scala/kafka/server/ReplicaManager.scala:
##########
@@ -941,39 +857,129 @@ class ReplicaManager(val config: KafkaConfig,
     }
   }
 
-  private def partitionEntriesForVerification(verificationGuards: 
mutable.Map[TopicPartition, VerificationGuard],
-                                              entriesPerPartition: 
Map[TopicPartition, MemoryRecords],
-                                              verifiedEntries: 
mutable.Map[TopicPartition, MemoryRecords],
-                                              unverifiedEntries: 
mutable.Map[TopicPartition, MemoryRecords],
-                                              errorEntries: 
mutable.Map[TopicPartition, Errors]): Unit= {
+  private def sendInvalidRequiredAcksResponse(entries: Map[TopicPartition, 
MemoryRecords],
+                                             responseCallback: 
Map[TopicPartition, PartitionResponse] => Unit): Unit = {
+    // If required.acks is outside accepted range, something is wrong with the 
client
+    // Just return an error and don't handle the request at all
+    val responseStatus = entries.map { case (topicPartition, _) =>
+      topicPartition -> new PartitionResponse(
+        Errors.INVALID_REQUIRED_ACKS,
+        LogAppendInfo.UNKNOWN_LOG_APPEND_INFO.firstOffset,
+        RecordBatch.NO_TIMESTAMP,
+        LogAppendInfo.UNKNOWN_LOG_APPEND_INFO.logStartOffset
+      )
+    }
+    responseCallback(responseStatus)
+  }
+
+  /**
+   * Apply the postVerificationCallback asynchronously only after verifying 
the partitions have been added to the transaction.
+   * The postVerificationCallback takes the arguments of the requestLocal for 
the thread that will be doing the append as
+   * well as a mapping of topic partitions to LogAppendResult for the 
partitions that saw errors when verifying.
+   *
+   * This method will start the verification process for all the 
topicPartitions in entriesPerPartition and supply the
+   * postVerificationCallback to be run on a request handler thread when the 
response is received.
+   *
+   * @param entriesPerPartition            the records per partition to be 
appended and therefore need verification
+   * @param transactionVerificationEntries the object that will store the 
entries to verify, the errors, and the verification guards
+   * @param transactionalId                the id for the transaction
+   * @param requestLocal                   container for the stateful 
instances scoped to this request -- this must correspond to the
+   *                                       thread calling this method
+   * @param postVerificationCallback       the method to be called when 
verification completes and the verification errors
+   *                                       and the thread's RequestLocal are 
supplied
+   */
+  def appendRecordsWithTransactionVerification(entriesPerPartition: 
Map[TopicPartition, MemoryRecords],
+                                               transactionVerificationEntries: 
TransactionVerificationEntries,
+                                               transactionalId: String,
+                                               requestLocal: RequestLocal,
+                                               postVerificationCallback: 
RequestLocal => Map[TopicPartition, LogAppendResult] => Unit): Unit = {
+    if (transactionalId != null && 
config.transactionPartitionVerificationEnable && 
addPartitionsToTxnManager.isDefined)
+      partitionEntriesForVerification(transactionVerificationEntries, 
entriesPerPartition)
+
+    val onVerificationComplete: (RequestLocal, Map[TopicPartition, Errors]) => 
Unit =
+      executePostVerificationCallback(
+        transactionVerificationEntries,
+        postVerificationCallback,
+      )
+
+    if (transactionVerificationEntries.unverified.isEmpty) {
+      onVerificationComplete(requestLocal, 
transactionVerificationEntries.errors.toMap)
+    } else {
+      // For unverified entries, send a request to verify. When verified, the 
append process will proceed via the callback.
+      // We verify above that all partitions use the same producer ID.
+      val batchInfo = 
transactionVerificationEntries.unverified.head._2.firstBatch()
+      addPartitionsToTxnManager.foreach(_.verifyTransaction(
+        transactionalId = transactionalId,
+        producerId = batchInfo.producerId,
+        producerEpoch = batchInfo.producerEpoch,
+        topicPartitions = 
transactionVerificationEntries.unverified.keySet.toSeq,
+        callback = 
KafkaRequestHandler.wrapAsyncCallback(onVerificationComplete, requestLocal)
+      ))
+    }
+  }
+
+  /**
+   * A helper method to compile the results from the transaction verification 
and call the postVerificationCallback.
+   *
+   * @param transactionVerificationEntries the object that will store the 
entries to verify, the errors, and the verification guards
+   * @param postVerificationCallback       the method to be called when 
verification completes and the verification errors
+   *                                       and the thread's RequestLocal are 
supplied
+   * @param requestLocal                   container for the stateful 
instances scoped to this request -- this must correspond to the
+   *                                       thread calling this method
+   *
+   */
+  private def executePostVerificationCallback(transactionVerificationEntries: 
TransactionVerificationEntries,
+                                                      
postVerificationCallback: RequestLocal => Map[TopicPartition, LogAppendResult] 
=> Unit)
+                                                     (requestLocal: 
RequestLocal, unverifiedEntries: Map[TopicPartition, Errors]): Unit = {
+    val errorResults = (unverifiedEntries ++ 
transactionVerificationEntries.errors).map {
+      case (topicPartition, error) =>
+        // translate transaction coordinator errors to known producer response 
errors
+        val customException =
+          error match {
+            case Errors.INVALID_TXN_STATE => Some(error.exception("Partition 
was not added to the transaction"))
+            case Errors.CONCURRENT_TRANSACTIONS |
+                 Errors.COORDINATOR_LOAD_IN_PROGRESS |
+                 Errors.COORDINATOR_NOT_AVAILABLE |
+                 Errors.NOT_COORDINATOR => Some(new NotEnoughReplicasException(
+              s"Unable to verify the partition has been added to the 
transaction. Underlying error: ${error.toString}"))
+            case _ => None
+          }
+        topicPartition -> LogAppendResult(
+          LogAppendInfo.UNKNOWN_LOG_APPEND_INFO,
+          Some(customException.getOrElse(error.exception)),
+          hasCustomErrorMessage = customException.isDefined
+        )
+    }
+    postVerificationCallback(requestLocal)(errorResults)
+  }
+
+  private def partitionEntriesForVerification(transactionVerificationEntries: 
TransactionVerificationEntries, entriesPerPartition: Map[TopicPartition, 
MemoryRecords]): TransactionVerificationEntries = {

Review Comment:
   Do we use the result of this function?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] KAFKA-15784: Ensure atomicity of in memory update and write when transactionally committing offsets [kafka]

Reply via email to