showuon commented on code in PR #15616: URL: https://github.com/apache/kafka/pull/15616#discussion_r1562184559
########## storage/src/main/java/org/apache/kafka/storage/internals/log/LogSegment.java: ########## @@ -800,8 +802,23 @@ private Void deleteTypeIfExists(StorageAction<Boolean, IOException> delete, Stri try { if (delete.execute()) LOGGER.info("Deleted {} {}.", fileType, file.getAbsolutePath()); - else if (logIfMissing) - LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + else { + if (logIfMissing) { + LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + } + + // During alter log dir, the log segment may be moved to a new directory, so async delete may fail. + // Fallback to delete the file in the new directory to avoid orphan file. + Pattern dirPattern = Pattern.compile("^(\\S+)-(\\S+)\\.(\\S+)-(delete|future)"); + Matcher dirMatcher = dirPattern.matcher(file.getParent()); + if (dirMatcher.matches()) { + String topicPartitionAbsolutePath = dirMatcher.group(1) + "-" + dirMatcher.group(2); + File fallbackFile = new File(topicPartitionAbsolutePath, file.getName()); + if (fallbackFile.exists() && fallbackFile.delete()) { Review Comment: Does the file name always end with `.delete`? Should we check it before deletion? ########## storage/src/main/java/org/apache/kafka/storage/internals/log/LogSegment.java: ########## @@ -800,8 +802,23 @@ private Void deleteTypeIfExists(StorageAction<Boolean, IOException> delete, Stri try { if (delete.execute()) LOGGER.info("Deleted {} {}.", fileType, file.getAbsolutePath()); - else if (logIfMissing) - LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + else { + if (logIfMissing) { + LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + } + + // During alter log dir, the log segment may be moved to a new directory, so async delete may fail. + // Fallback to delete the file in the new directory to avoid orphan file. 
+ Pattern dirPattern = Pattern.compile("^(\\S+)-(\\S+)\\.(\\S+)-(delete|future)"); Review Comment: 1. Why does it contain `delete` at the end? 2. Unfortunately, the topic name could contain `-` or `.`, so it's unsafe to do regex like this. I'm thinking we can pass `topicPartition` as a parameter into `deleteTypeIfExists` so that we don't have to do further regex like this. And just verify if fileName.endsWith("future") because the normal folder name should always end with a number (partition number), instead of "future". WDYT? ########## storage/src/main/java/org/apache/kafka/storage/internals/log/LogSegment.java: ########## @@ -800,8 +802,23 @@ private Void deleteTypeIfExists(StorageAction<Boolean, IOException> delete, Stri try { if (delete.execute()) LOGGER.info("Deleted {} {}.", fileType, file.getAbsolutePath()); - else if (logIfMissing) - LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + else { + if (logIfMissing) { + LOGGER.info("Failed to delete {} {} because it does not exist.", fileType, file.getAbsolutePath()); + } + + // During alter log dir, the log segment may be moved to a new directory, so async delete may fail. + // Fallback to delete the file in the new directory to avoid orphan file. + Pattern dirPattern = Pattern.compile("^(\\S+)-(\\S+)\\.(\\S+)-(delete|future)"); + Matcher dirMatcher = dirPattern.matcher(file.getParent()); + if (dirMatcher.matches()) { + String topicPartitionAbsolutePath = dirMatcher.group(1) + "-" + dirMatcher.group(2); + File fallbackFile = new File(topicPartitionAbsolutePath, file.getName()); + if (fallbackFile.exists() && fallbackFile.delete()) { + LOGGER.warn("Fallback to delete {} {}.", fileType, fallbackFile.getAbsolutePath()); Review Comment: Why did we use `warn` here? I think we can use `info` since it's expected behavior. WDYT? 
########## core/src/test/scala/unit/kafka/server/AlterReplicaLogDirsRequestTest.scala: ########## @@ -116,6 +118,57 @@ class AlterReplicaLogDirsRequestTest extends BaseRequestTest { assertEquals(Errors.KAFKA_STORAGE_ERROR, findErrorForPartition(alterReplicaDirResponse3, new TopicPartition(topic, 2))) } + @Test + def testAlterReplicaLogDirsRequestWithRetention(): Unit = { + val partitionNum = 1 + + // Alter replica dir before topic creation + val logDir1 = new File(servers.head.config.logDirs(1)).getAbsolutePath + val partitionDirs1 = (0 until partitionNum).map(partition => new TopicPartition(topic, partition) -> logDir1).toMap + val alterReplicaLogDirsResponse1 = sendAlterReplicaLogDirsRequest(partitionDirs1) + + // The response should show error UNKNOWN_TOPIC_OR_PARTITION for all partitions + val tp = new TopicPartition(topic, 0) + assertEquals(Errors.UNKNOWN_TOPIC_OR_PARTITION, findErrorForPartition(alterReplicaLogDirsResponse1, tp)) + assertTrue(servers.head.logManager.getLog(tp).isEmpty) + + val topicProperties = new Properties() + topicProperties.put(TopicConfig.RETENTION_BYTES_CONFIG, "1024") + topicProperties.put(TopicConfig.FILE_DELETE_DELAY_MS_CONFIG, "10000") + topicProperties.put(TopicConfig.SEGMENT_BYTES_CONFIG, "1024") + + createTopic(topic, partitionNum, 1, topicProperties) + assertEquals(logDir1, servers.head.logManager.getLog(tp).get.dir.getParent) + + // send enough records to trigger log rolling + (0 until 20).foreach { _ => + TestUtils.generateAndProduceMessages(servers, topic, 10, 1) + } + TestUtils.waitUntilTrue(() => servers.head.logManager.getLog(new TopicPartition(topic, 0)).get.numberOfSegments > 1, + "timed out waiting for log segment to roll") + + // Wait for log segment retention. LogManager#InitialTaskDelayMs is 30 seconds. + // The first retention task is executed after 30 seconds, so waiting for 35 seconds should be enough. 
+ TestUtils.waitUntilTrue(() => { + new File(logDir1, tp.toString).listFiles().count(_.getName.endsWith(LogFileUtils.DELETED_FILE_SUFFIX)) > 0 + }, "timed out waiting for log segment to retention", 35000) Review Comment: We should override the retention interval config (i.e. `log.retention.check.interval.ms`) to maybe 500 ms to speed it up, so that we don't need to increase the wait time. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org