jsancio commented on code in PR #19762: URL: https://github.com/apache/kafka/pull/19762#discussion_r2190478910
########## raft/src/main/java/org/apache/kafka/raft/KafkaRaftLog.java: ########## @@ -0,0 +1,828 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.raft; + +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.Uuid; +import org.apache.kafka.common.config.TopicConfig; +import org.apache.kafka.common.errors.CorruptRecordException; +import org.apache.kafka.common.errors.InvalidConfigurationException; +import org.apache.kafka.common.record.MemoryRecords; +import org.apache.kafka.common.record.Records; +import org.apache.kafka.common.utils.LogContext; +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.common.utils.Utils; +import org.apache.kafka.server.common.OffsetAndEpoch; +import org.apache.kafka.server.config.ServerLogConfigs; +import org.apache.kafka.server.storage.log.FetchIsolation; +import org.apache.kafka.server.util.Scheduler; +import org.apache.kafka.snapshot.FileRawSnapshotReader; +import org.apache.kafka.snapshot.FileRawSnapshotWriter; +import org.apache.kafka.snapshot.NotifyingRawSnapshotWriter; +import org.apache.kafka.snapshot.RawSnapshotReader; +import 
org.apache.kafka.snapshot.RawSnapshotWriter; +import org.apache.kafka.snapshot.SnapshotPath; +import org.apache.kafka.snapshot.Snapshots; +import org.apache.kafka.storage.internals.log.AppendOrigin; +import org.apache.kafka.storage.internals.log.FetchDataInfo; +import org.apache.kafka.storage.internals.log.LogConfig; +import org.apache.kafka.storage.internals.log.LogDirFailureChannel; +import org.apache.kafka.storage.internals.log.LogStartOffsetIncrementReason; +import org.apache.kafka.storage.internals.log.ProducerStateManagerConfig; +import org.apache.kafka.storage.internals.log.UnifiedLog; +import org.apache.kafka.storage.log.metrics.BrokerTopicStats; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Properties; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; + +public class KafkaRaftLog implements RaftLog { + + private static final Logger LOG = LoggerFactory.getLogger(KafkaRaftLog.class); + + private final Logger logger; + private final UnifiedLog log; + private final Time time; + private final Scheduler scheduler; + // Access to this object needs to be synchronized because it is used by the snapshotting thread to notify the + // polling thread when snapshots are created. This object is also used to store any opened snapshot reader. 
+ private final TreeMap<OffsetAndEpoch, Optional<FileRawSnapshotReader>> snapshots; + private final TopicPartition topicPartition; + private final MetadataLogConfig config; + private final String logIdent; + + public static KafkaRaftLog createLog( + TopicPartition topicPartition, + Uuid topicId, + File dataDir, + Time time, + Scheduler scheduler, + MetadataLogConfig config, + int nodeId) throws IOException { + Properties props = new Properties(); + props.setProperty(TopicConfig.MAX_MESSAGE_BYTES_CONFIG, String.valueOf(config.internalMaxBatchSizeInBytes())); + if (config.internalSegmentBytes() != null) { + props.setProperty(LogConfig.INTERNAL_SEGMENT_BYTES_CONFIG, String.valueOf(config.internalSegmentBytes())); + } else { + props.setProperty(TopicConfig.SEGMENT_BYTES_CONFIG, String.valueOf(config.logSegmentBytes())); + } + props.setProperty(TopicConfig.FILE_DELETE_DELAY_MS_CONFIG, String.valueOf(ServerLogConfigs.LOG_DELETE_DELAY_MS_DEFAULT)); + + // Disable time and byte retention when deleting segments + props.setProperty(TopicConfig.RETENTION_MS_CONFIG, "-1"); + props.setProperty(TopicConfig.RETENTION_BYTES_CONFIG, "-1"); + LogConfig.validate(props); + LogConfig defaultLogConfig = new LogConfig(props); + + if (defaultLogConfig.retentionMs >= 0) { + throw new InvalidConfigurationException( + "Cannot set " + TopicConfig.RETENTION_MS_CONFIG + " above -1: " + defaultLogConfig.retentionMs + ); + } else if (defaultLogConfig.retentionSize >= 0) { + throw new InvalidConfigurationException( + "Cannot set " + TopicConfig.RETENTION_BYTES_CONFIG + " above -1: " + defaultLogConfig.retentionSize + ); + } + + UnifiedLog log = UnifiedLog.create( + dataDir, + defaultLogConfig, + 0L, + 0L, + scheduler, + new BrokerTopicStats(), + time, + Integer.MAX_VALUE, + new ProducerStateManagerConfig(Integer.MAX_VALUE, false), + Integer.MAX_VALUE, + new LogDirFailureChannel(5), + false, + Optional.of(topicId) + ); + + KafkaRaftLog metadataLog = new KafkaRaftLog( + log, + time, + scheduler, + 
recoverSnapshots(log), + topicPartition, + config, + nodeId + ); + + // When recovering, truncate fully if the latest snapshot is after the log end offset. This can happen to a follower + // when the follower crashes after downloading a snapshot from the leader but before it could truncate the log fully. + metadataLog.truncateToLatestSnapshot(); + + return metadataLog; + } + + public KafkaRaftLog( + UnifiedLog log, + Time time, + Scheduler scheduler, + // Access to this object needs to be synchronized because it is used by the snapshotting thread to notify the + // polling thread when snapshots are created. This object is also used to store any opened snapshot reader. + TreeMap<OffsetAndEpoch, Optional<FileRawSnapshotReader>> snapshots, + TopicPartition topicPartition, + MetadataLogConfig config, + int nodeId) { + this.log = log; + this.time = time; + this.scheduler = scheduler; + this.snapshots = snapshots; + this.topicPartition = topicPartition; + this.config = config; + this.logIdent = "[MetadataLog partition=" + topicPartition + ", nodeId=" + nodeId + "] "; + this.logger = new LogContext(logIdent).logger(KafkaRaftLog.class); + } + + // for testing + UnifiedLog log() { + return log; + } + + @Override + public LogFetchInfo read(long startOffset, Isolation readIsolation) { + FetchIsolation isolation = switch (readIsolation) { + case COMMITTED -> FetchIsolation.HIGH_WATERMARK; + case UNCOMMITTED -> FetchIsolation.LOG_END; + }; + + try { + FetchDataInfo fetchInfo = log.read(startOffset, config.internalMaxFetchSizeInBytes(), isolation, true); + return new LogFetchInfo( + fetchInfo.records, + new LogOffsetMetadata( + fetchInfo.fetchOffsetMetadata.messageOffset, + Optional.of(new SegmentPosition( + fetchInfo.fetchOffsetMetadata.segmentBaseOffset, + fetchInfo.fetchOffsetMetadata.relativePositionInSegment)) + ) + ); + } catch (IOException ioe) { + throw new UncheckedIOException(ioe); + } + } + + @Override + public LogAppendInfo appendAsLeader(Records records, int epoch) 
{ + if (records.sizeInBytes() == 0) { + throw new IllegalArgumentException("Attempt to append an empty record set"); + } + + try { + return handleAndConvertLogAppendInfo(log.appendAsLeader((MemoryRecords) records, epoch, AppendOrigin.RAFT_LEADER)); + } catch (IOException ioe) { + throw new UncheckedIOException(ioe); + } + } + + @Override + public LogAppendInfo appendAsFollower(Records records, int epoch) { + if (records.sizeInBytes() == 0) { + throw new IllegalArgumentException("Attempt to append an empty record set"); + } + + return handleAndConvertLogAppendInfo(log.appendAsFollower((MemoryRecords) records, epoch)); + } + + private LogAppendInfo handleAndConvertLogAppendInfo(org.apache.kafka.storage.internals.log.LogAppendInfo appendInfo) { + if (appendInfo.firstOffset() == UnifiedLog.UNKNOWN_OFFSET) { + throw new CorruptRecordException("Append failed unexpectedly " + appendInfo); + } else { + return new LogAppendInfo(appendInfo.firstOffset(), appendInfo.lastOffset()); + } + } + + @Override + public int lastFetchedEpoch() { + Optional<Integer> latestEpoch = log.latestEpoch(); + return latestEpoch.orElseGet(() -> latestSnapshotId().map(snapshotId -> { + long logEndOffset = endOffset().offset(); + long startOffset = startOffset(); + if (snapshotId.offset() == startOffset && snapshotId.offset() == logEndOffset) { + // Return the epoch of the snapshot when the log is empty + return snapshotId.epoch(); + } else { + throw new KafkaException( + "Log doesn't have a last fetch epoch and there is a snapshot (" + snapshotId + "). 
" + + "Expected the snapshot's end offset to match the log's end offset (" + logEndOffset + + ") and the log start offset (" + startOffset + ")" + ); + } + }).orElse(0)); + } + + @Override + public OffsetAndEpoch endOffsetForEpoch(int epoch) { + Optional<OffsetAndEpoch> endOffsetEpochOpt = log.endOffsetForEpoch(epoch); + Optional<OffsetAndEpoch> earliestSnapshotIdOpt = earliestSnapshotId(); + if (endOffsetEpochOpt.isPresent()) { + OffsetAndEpoch endOffsetEpoch = endOffsetEpochOpt.get(); + if (earliestSnapshotIdOpt.isPresent()) { + OffsetAndEpoch earliestSnapshotId = earliestSnapshotIdOpt.get(); + if (endOffsetEpoch.offset() == earliestSnapshotId.offset() && endOffsetEpoch.epoch() == epoch) { + // The epoch is smaller than the smallest epoch on the log. Override the diverging + // epoch to the oldest snapshot which should be the snapshot at the log start offset Review Comment:

The equality is between `endOffsetEpoch.epoch()` and `epoch` (the requested epoch). The leader epoch cache returns the requested epoch and the log start offset if the requested epoch is less than the earliest known epoch in the log. This is probably the most common case for this:

```
Kafka Replicated Log:

LogStartOffset --             high-watermark --      LEO --
                 V                             V           V
               -----------------------------------------------
       offset: | x | ... | y - 1 | y |  ...  |   |  ...  |   |
       epoch:  | b | ... | c     | d |  ...  |   |  ...  |   |
               -----------------------------------------------

Kafka Snapshot Files:

<topic_name>-<partition_index>/x-a.checkpoint
<topic_name>-<partition_index>/y-c.checkpoint
```

When the requested epoch is **e**, which is less than **b**, the unified log's leader epoch cache will return **(e, x)**, which may not be an accurate epoch and end offset. In KRaft, we instead return **(a, x)** (the checkpoint at that end offset), which is an accurate epoch and end offset. Note that in KRaft, there is always a checkpoint at the log start offset. That is a KRaft invariant.

-- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org