adixitconfluent commented on code in PR #19261:
URL: https://github.com/apache/kafka/pull/19261#discussion_r2013699195
##########
core/src/main/java/kafka/server/share/SharePartition.java:
##########
@@ -2484,6 +2497,259 @@ private long startOffsetDuringInitialization(long partitionDataStartOffset) thro
 }
 }
+    private ShareAcquiredRecords filterAbortedTransactionalAcquiredRecords(FetchPartitionData fetchPartitionData, FetchIsolation isolationLevel, ShareAcquiredRecords shareAcquiredRecords) {
+        if (isolationLevel != FetchIsolation.TXN_COMMITTED)
+            return shareAcquiredRecords;
+        // When FetchIsolation.TXN_COMMITTED is used as isolation type by the share group, we need to filter any
+        // transactions that were aborted/did not commit due to timeout.
+        List<AcquiredRecords> result = filterAbortedTransactionalRecords(fetchPartitionData.records.batches(), shareAcquiredRecords.acquiredRecords(), fetchPartitionData.abortedTransactions);
+        int acquiredCount = 0;
+        for (AcquiredRecords records : result) {
+            acquiredCount += (int) (records.lastOffset() - records.firstOffset() + 1);
+        }
+        return new ShareAcquiredRecords(result, acquiredCount);
+    }
+
+    private List<AcquiredRecords> filterAbortedTransactionalRecords(
+        Iterable<? extends RecordBatch> batches,
+        List<AcquiredRecords> acquiredRecords,
+        Optional<List<FetchResponseData.AbortedTransaction>> abortedTransactions
+    ) {
+        lock.writeLock().lock();
+        try {
+            if (abortedTransactions.isEmpty())
+                return acquiredRecords;
+            // The record batches that need to be archived in cachedState because they were a part of aborted transactions.
+            List<RecordBatch> recordsToArchive = fetchAbortedTransactionRecordBatches(batches, abortedTransactions);
+            for (RecordBatch recordBatch : recordsToArchive) {
+                // Archive the offsets/batches in the cached state.
+                NavigableMap<Long, InFlightBatch> subMap = fetchSubMap(recordBatch);
+                archiveAcquiredBatchRecords(subMap, recordBatch);
+            }
+            return filterRecordBatchesFromAcquiredRecords(acquiredRecords, recordsToArchive);
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // Visible for testing.
+    List<AcquiredRecords> filterRecordBatchesFromAcquiredRecords(
+        List<AcquiredRecords> acquiredRecords,
+        List<RecordBatch> recordsToArchive
+    ) {
+        lock.writeLock().lock();
+        try {
+            List<AcquiredRecords> result = new ArrayList<>();
+
+            for (AcquiredRecords acquiredRecord : acquiredRecords) {
+                List<AcquiredRecords> tempAcquiredRecords = new ArrayList<>();
+                tempAcquiredRecords.add(acquiredRecord);
+                for (RecordBatch recordBatch : recordsToArchive) {
+                    List<AcquiredRecords> newAcquiredRecords = new ArrayList<>();
+                    for (AcquiredRecords temp : tempAcquiredRecords) {
+                        // Check if record batch overlaps with the acquired records.
+                        if (temp.firstOffset() <= recordBatch.lastOffset() && temp.lastOffset() >= recordBatch.baseOffset()) {
+                            // Split the acquired record into parts before, inside, and after the overlapping record batch.
+                            if (temp.firstOffset() < recordBatch.baseOffset()) {
+                                newAcquiredRecords.add(new AcquiredRecords()
+                                    .setFirstOffset(temp.firstOffset())
+                                    .setLastOffset(recordBatch.baseOffset() - 1)
+                                    .setDeliveryCount((short) 1));
+                            }
+                            if (temp.lastOffset() > recordBatch.lastOffset()) {
+                                newAcquiredRecords.add(new AcquiredRecords()
+                                    .setFirstOffset(recordBatch.lastOffset() + 1)
+                                    .setLastOffset(temp.lastOffset())
+                                    .setDeliveryCount((short) 1));
+                            }
+                        } else {
+                            newAcquiredRecords.add(temp);
+                        }
+                    }
+                    tempAcquiredRecords = newAcquiredRecords;
+                }
+                result.addAll(tempAcquiredRecords);
+            }
+            return result;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private void archiveAcquiredBatchRecords(NavigableMap<Long, InFlightBatch> subMap, RecordBatch recordBatch) {
+        lock.writeLock().lock();
+        try {
+            // The fetched batch either is exact fetch equivalent batch (mostly), subset
+            // or spans over multiple fetched batches. The state can vary per offset itself from
+            // the fetched batch in case of subset.
+            for (Map.Entry<Long, InFlightBatch> entry : subMap.entrySet()) {
+                InFlightBatch inFlightBatch = entry.getValue();
+
+                // If startOffset has moved ahead of the in-flight batch, skip the batch.
+                if (inFlightBatch.lastOffset() < startOffset) {
+                    log.trace("All offsets in the inflight batch {} are already archived: {}-{}",
+                        inFlightBatch, groupId, topicIdPartition);
+                    continue;
+                }
+
+                // Determine if the in-flight batch is a full match from the request batch.
+                boolean fullMatch = checkForFullMatch(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset());
+
+                // Maintain state per offset if the inflight batch is not a full match or the
+                // offset state is managed for this in-flight batch.
+                if (!fullMatch || inFlightBatch.offsetState() != null) {
+                    log.debug("Subset or offset tracked batch record found for record,"
+                        + " batch: {}, request offsets - first: {}, last: {} for the share partition: {}-{}",
+                        inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset(), groupId, topicIdPartition);
+                    if (inFlightBatch.offsetState() == null) {
+                        // The record batch is a subset and requires per offset state hence initialize
+                        // the offsets state in the in-flight batch.
+                        inFlightBatch.maybeInitializeOffsetStateUpdate();
+                    }
+                    archivePerOffsetAcquiredBatchRecords(inFlightBatch, recordBatch.baseOffset(), recordBatch.lastOffset());
+                    continue;
+                }
+                // The in-flight batch is a full match hence change the state of the complete batch.
+                archiveCompleteAcquiredBatch(inFlightBatch);
+            }
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private void archivePerOffsetAcquiredBatchRecords(InFlightBatch inFlightBatch, long startOffsetToArchive, long endOffsetToArchive) {
+        lock.writeLock().lock();
+        try {
+            log.trace("Archiving offset tracked batch: {} for the share partition: {}-{} since it was a part of aborted transaction", inFlightBatch, groupId, topicIdPartition);
+            for (Map.Entry<Long, InFlightState> offsetState : inFlightBatch.offsetState().entrySet()) {
+                if (offsetState.getKey() < startOffsetToArchive) {
+                    continue;
+                }
+                if (offsetState.getKey() > endOffsetToArchive) {
+                    // No further offsets to process.
+                    break;
+                }
+                if (offsetState.getValue().state != RecordState.ACQUIRED) {
+                    continue;
+                }
+                offsetState.getValue().archive(EMPTY_MEMBER_ID);
+                offsetState.getValue().cancelAndClearAcquisitionLockTimeoutTask();
+            }
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private void archiveCompleteAcquiredBatch(InFlightBatch inFlightBatch) {
+        lock.writeLock().lock();
+        try {
+            log.trace("Archiving complete batch: {} for the share partition: {}-{} since it was a part of aborted transaction", inFlightBatch, groupId, topicIdPartition);
+            if (inFlightBatch.batchState() == RecordState.ACQUIRED) {
+                // Change the state of complete batch since the same state exists for the entire inFlight batch.
+                inFlightBatch.archiveBatch(EMPTY_MEMBER_ID);
+                inFlightBatch.batchState.cancelAndClearAcquisitionLockTimeoutTask();
+            }
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private NavigableMap<Long, InFlightBatch> fetchSubMap(RecordBatch recordBatch) {
+        lock.writeLock().lock();
+        try {
+            Map.Entry<Long, InFlightBatch> floorOffset = cachedState.floorEntry(recordBatch.baseOffset());
+            if (floorOffset == null) {
+                log.debug("Fetched batch record {} not found for share partition: {}-{}", recordBatch, groupId,
+                    topicIdPartition);
+                throw new InvalidRecordStateException(
+                    "Batch record not found. The request batch offsets are not found in the cache.");
+            }
+            return cachedState.subMap(floorOffset.getKey(), true, recordBatch.lastOffset(), true);
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // Visible for testing.
+    List<RecordBatch> fetchAbortedTransactionRecordBatches(
+        Iterable<? extends RecordBatch> batches,
+        Optional<List<FetchResponseData.AbortedTransaction>> abortedTransactions
+    ) {
+        lock.writeLock().lock();
+        try {
+            PriorityQueue<FetchResponseData.AbortedTransaction> abortedTransactionsHeap = abortedTransactionsHeap(abortedTransactions.get());
+            Set<Long> abortedProducerIds = new HashSet<>();
+            List<RecordBatch> recordsToArchive = new ArrayList<>();
+
+            for (RecordBatch currentBatch : batches) {
+                if (currentBatch.hasProducerId()) {
+                    // Remove from the aborted transactions queue all aborted transactions which have begun
+                    // before the current batch's last offset, and add the associated producerIds to the
+                    // aborted producer set.
+                    if (abortedTransactionsHeap != null) {
+                        while (!abortedTransactionsHeap.isEmpty() && abortedTransactionsHeap.peek().firstOffset() <= currentBatch.lastOffset()) {
+                            FetchResponseData.AbortedTransaction abortedTransaction = abortedTransactionsHeap.poll();
+                            abortedProducerIds.add(abortedTransaction.producerId());
+                        }
+                    }
+                    long producerId = currentBatch.producerId();
+                    if (containsAbortMarker(currentBatch)) {
+                        abortedProducerIds.remove(producerId);
+                    } else if (isBatchAborted(currentBatch, abortedProducerIds)) {
+                        log.debug("Skipping aborted record batch for share partition: {}-{} with producerId {} and " +
+                            "offsets {} to {}", groupId, topicIdPartition, producerId, currentBatch.baseOffset(), currentBatch.lastOffset());
+                        recordsToArchive.add(currentBatch);
+                    }
+                }
+            }
+            return recordsToArchive;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private PriorityQueue<FetchResponseData.AbortedTransaction> abortedTransactionsHeap(List<FetchResponseData.AbortedTransaction> abortedTransactions) {
+        lock.writeLock().lock();
+        try {
+            if (abortedTransactions == null || abortedTransactions.isEmpty())
+                return null;
+
+            PriorityQueue<FetchResponseData.AbortedTransaction> abortedTransactionsHeap = new PriorityQueue<>(
+                abortedTransactions.size(), Comparator.comparingLong(FetchResponseData.AbortedTransaction::firstOffset)
+            );
+            abortedTransactionsHeap.addAll(abortedTransactions);
+            return abortedTransactionsHeap;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private boolean isBatchAborted(RecordBatch batch, Set<Long> abortedProducerIds) {
+        lock.writeLock().lock();
+        try {
+            return batch.isTransactional() && abortedProducerIds.contains(batch.producerId());
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    private boolean containsAbortMarker(RecordBatch batch) {
+        lock.writeLock().lock();
+        try {
+            if (!batch.isControlBatch())
+                return false;
+
+            Iterator<Record> batchIterator = batch.iterator();
+            if (!batchIterator.hasNext())
+                return false;
+
+            Record firstRecord = batchIterator.next();
+            return ControlRecordType.ABORT == ControlRecordType.parse(firstRecord.key());
Review Comment:
I have replicated the code that is present in the regular consumer client for this case. When we call `batch.isControlBatch()`, we only read the batch header and not the entire record, and the check on `record.key()` involves reading only the size of the key, AFAIK. Not sure if there is a better way to do it.
cc - @AndrewJSchofield
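
For anyone reading along, here is a minimal, self-contained sketch (not part of this PR) of what the check sees: it builds an ABORT control batch via `MemoryRecords.withEndTransactionMarker` and then applies the same `isControlBatch()` / `ControlRecordType.parse(record.key())` pattern used above. The class and method names come from the clients module; the exact `withEndTransactionMarker` overload may differ across versions.

```java
import org.apache.kafka.common.record.ControlRecordType;
import org.apache.kafka.common.record.EndTransactionMarker;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;

public class AbortMarkerCheckSketch {
    public static void main(String[] args) {
        // Build a single ABORT control batch for a hypothetical transactional producer.
        long producerId = 1000L;
        short producerEpoch = 0;
        int coordinatorEpoch = 0;
        MemoryRecords records = MemoryRecords.withEndTransactionMarker(
            producerId, producerEpoch, new EndTransactionMarker(ControlRecordType.ABORT, coordinatorEpoch));

        for (RecordBatch batch : records.batches()) {
            // The batch header alone tells us this is a control batch.
            if (!batch.isControlBatch())
                continue;
            for (Record record : batch) {
                // Parsing the key reads the version and type fields of the control record.
                boolean isAbort = ControlRecordType.ABORT == ControlRecordType.parse(record.key());
                System.out.println("abort marker: " + isAbort);
            }
        }
    }
}
```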
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]