Hangleton commented on code in PR #13535: URL: https://github.com/apache/kafka/pull/13535#discussion_r1166887367
########## core/src/main/java/kafka/log/remote/RemoteLogManager.java: ########## @@ -600,6 +622,176 @@ public String toString() { } } + public FetchDataInfo read(RemoteStorageFetchInfo remoteStorageFetchInfo) throws RemoteStorageException, IOException { + int fetchMaxBytes = remoteStorageFetchInfo.fetchMaxBytes; + TopicPartition tp = remoteStorageFetchInfo.topicPartition; + FetchRequest.PartitionData fetchInfo = remoteStorageFetchInfo.fetchInfo; + + boolean includeAbortedTxns = remoteStorageFetchInfo.fetchIsolation == FetchIsolation.TXN_COMMITTED; + + long offset = fetchInfo.fetchOffset; + int maxBytes = Math.min(fetchMaxBytes, fetchInfo.maxBytes); + + Optional<UnifiedLog> logOptional = fetchLog.apply(tp); + OptionalInt epoch = OptionalInt.empty(); + + if (logOptional.isPresent()) { + Option<LeaderEpochFileCache> leaderEpochCache = logOptional.get().leaderEpochCache(); + if (leaderEpochCache.isDefined()) { + epoch = leaderEpochCache.get().epochForOffset(offset); + } + } + + Optional<RemoteLogSegmentMetadata> rlsMetadata = epoch.isPresent() + ? fetchRemoteLogSegmentMetadata(tp, epoch.getAsInt(), offset) + : Optional.empty(); + + if (!rlsMetadata.isPresent()) { + String epochStr = (epoch.isPresent()) ? 
Integer.toString(epoch.getAsInt()) : "NOT AVAILABLE"; + throw new OffsetOutOfRangeException("Received request for offset " + offset + " for leader epoch " + + epochStr + " and partition " + tp + " which does not exist in remote tier."); + } + + int startPos = lookupPositionForOffset(rlsMetadata.get(), offset); + InputStream remoteSegInputStream = null; + try { + // Search forward for the position of the last offset that is greater than or equal to the target offset + remoteSegInputStream = remoteLogStorageManager.fetchLogSegment(rlsMetadata.get(), startPos); + RemoteLogInputStream remoteLogInputStream = new RemoteLogInputStream(remoteSegInputStream); + + RecordBatch firstBatch = findFirstBatch(remoteLogInputStream, offset); + + if (firstBatch == null) + return new FetchDataInfo(new LogOffsetMetadata(offset), MemoryRecords.EMPTY, false, + includeAbortedTxns ? Optional.of(Collections.emptyList()) : Optional.empty()); + + int updatedFetchSize = + remoteStorageFetchInfo.minOneMessage && firstBatch.sizeInBytes() > maxBytes + ? firstBatch.sizeInBytes() : maxBytes; + + ByteBuffer buffer = ByteBuffer.allocate(updatedFetchSize); + int remainingBytes = updatedFetchSize; + + firstBatch.writeTo(buffer); + remainingBytes -= firstBatch.sizeInBytes(); + + if (remainingBytes > 0) { + // input stream is read till (startPos - 1) while getting the batch of records earlier. + // read the input stream until min of (EOF stream or buffer's remaining capacity). 
+ Utils.readFully(remoteSegInputStream, buffer); + } + buffer.flip(); + + FetchDataInfo fetchDataInfo = new FetchDataInfo(new LogOffsetMetadata(offset), MemoryRecords.readableRecords(buffer)); + if (includeAbortedTxns) { + fetchDataInfo = addAbortedTransactions(firstBatch.baseOffset(), rlsMetadata.get(), fetchDataInfo); + } + + return fetchDataInfo; + } finally { + Utils.closeQuietly(remoteSegInputStream, "RemoteLogSegmentInputStream"); + } + } + + private int lookupPositionForOffset(RemoteLogSegmentMetadata remoteLogSegmentMetadata, long offset) { + return indexCache.lookupOffset(remoteLogSegmentMetadata, offset); + } + + private FetchDataInfo addAbortedTransactions(long startOffset, + RemoteLogSegmentMetadata segmentMetadata, + FetchDataInfo fetchInfo) throws RemoteStorageException { + int fetchSize = fetchInfo.records.sizeInBytes(); + OffsetPosition startOffsetPosition = new OffsetPosition(fetchInfo.fetchOffsetMetadata.messageOffset, + fetchInfo.fetchOffsetMetadata.relativePositionInSegment); + + OffsetIndex offsetIndex = indexCache.getIndexEntry(segmentMetadata).offsetIndex(); + long upperBoundOffset = offsetIndex.fetchUpperBoundOffset(startOffsetPosition, fetchSize) + .map(x -> x.offset).orElse(segmentMetadata.endOffset() + 1); + + final List<FetchResponseData.AbortedTransaction> abortedTransactions = new ArrayList<>(); + + Consumer<List<AbortedTxn>> accumulator = + abortedTxns -> abortedTransactions.addAll(abortedTxns.stream() + .map(AbortedTxn::asAbortedTransaction).collect(Collectors.toList())); + + collectAbortedTransactions(startOffset, upperBoundOffset, segmentMetadata, accumulator); + + return new FetchDataInfo(fetchInfo.fetchOffsetMetadata, + fetchInfo.records, + fetchInfo.firstEntryIncomplete, + Optional.of(abortedTransactions)); + } + + private void collectAbortedTransactions(long startOffset, + long upperBoundOffset, + RemoteLogSegmentMetadata segmentMetadata, + Consumer<List<AbortedTxn>> accumulator) throws RemoteStorageException { + TopicPartition 
topicPartition = segmentMetadata.topicIdPartition().topicPartition(); + Iterator<LogSegment> localLogSegments = fetchLog.apply(topicPartition) + .map(log -> JavaConverters.asJavaCollection(log.logSegments())) + .map(Collection::iterator) + .orElse(Collections.emptyIterator()); + + boolean searchInLocalLog = false; + Optional<RemoteLogSegmentMetadata> nextSegmentMetadataOpt = Optional.of(segmentMetadata); + Optional<TransactionIndex> txnIndexOpt = nextSegmentMetadataOpt.map(metadata -> indexCache.getIndexEntry(metadata).txnIndex()); + + while (txnIndexOpt.isPresent()) { + TxnIndexSearchResult searchResult = txnIndexOpt.get().collectAbortedTxns(startOffset, upperBoundOffset); + accumulator.accept(searchResult.abortedTransactions); + if (!searchResult.isComplete) { + if (!searchInLocalLog) { + nextSegmentMetadataOpt = findNextSegmentMetadata(nextSegmentMetadataOpt.get()); + + txnIndexOpt = nextSegmentMetadataOpt.map(x -> indexCache.getIndexEntry(x).txnIndex()); Review Comment: There may be an availability risk here. The remote index cache allows up to 1,024 entries by default and it is possible to keep a transaction index entry permanently rotated in and out of the cache so that retries of Fetch requests for a partition at a given (constant) fetch offset continuously trigger a download of the index. This, plus the fact that there is a global lock for read and write access on the remote index cache, can lead to lock contention on the remote log reader thread pool. In the worst case, it is possible for a partition to be prevented from making progress on the fetch path. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscribe@kafka.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org