danny0405 commented on code in PR #11440: URL: https://github.com/apache/hudi/pull/11440#discussion_r1674898261
########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java: ########## @@ -553,13 +552,62 @@ public Pair<Boolean, List<CleanFileInfo>> getDeletePaths(String partitionPath, O * Returns the earliest commit to retain based on cleaning policy. */ public Option<HoodieInstant> getEarliestCommitToRetain() { - return CleanerUtils.getEarliestCommitToRetain( - hoodieTable.getMetaClient().getActiveTimeline().getCommitsAndCompactionTimeline(), - config.getCleanerPolicy(), - config.getCleanerCommitsRetained(), - Instant.now(), - config.getCleanerHoursRetained(), - hoodieTable.getMetaClient().getTableConfig().getTimelineTimezone()); + if (!earliestCommitToRetain.isPresent()) { + earliestCommitToRetain = CleanerUtils.getEarliestCommitToRetain( + hoodieTable.getMetaClient().getActiveTimeline().getCommitsAndCompactionTimeline(), + config.getCleanerPolicy(), + config.getCleanerCommitsRetained(), + Instant.now(), + config.getCleanerHoursRetained(), + hoodieTable.getMetaClient().getTableConfig().getTimelineTimezone()); + } + return earliestCommitToRetain; + } + + /** + * Returns earliest commit to not archive to assist with guarding archival. + * @return + */ + public Option<String> getEarliestCommitToNotArchive() { + // compute only if not set + if (!earliestCommitToNotArchiveOpt.isPresent() && config.getCleanerPolicy() != HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { // earliest commit to retain and earliest commit to not archive + // makes sense only when clean policy is num commits based or hours based. + Option<HoodieInstant> earliestToRetain = getEarliestCommitToRetain(); + String earliestCommitToNotArchive = null; + if (!savepointedTimestamps.isEmpty()) { // if savepoints are found. + // find the first savepoint timestamp + String firstSavepoint = savepointedTimestamps.stream().sorted().findFirst().get(); + earliestCommitToNotArchive = firstSavepoint; + + // earliest commit to not archive = min (earliest commit to retain, earliest commit to not archive) + if (earliestToRetain.isPresent()) { + String earliestToRetainTimestamp = earliestToRetain.get().getTimestamp(); + // When earliestToRetainTimestamp is greater than first savepoint timestamp but there are no + // replace commits between the first savepoint and the earliestToRetainTimestamp, we can set the + // earliestCommitToNotArchive to earliestToRetainTimestamp + if (HoodieTimeline.compareTimestamps(earliestToRetainTimestamp, HoodieTimeline.GREATER_THAN, firstSavepoint)) { + HoodieTimeline replaceTimeline = hoodieTable.getActiveTimeline().getCompletedReplaceTimeline().findInstantsInClosedRange(firstSavepoint, earliestToRetainTimestamp); + earliestCommitToNotArchive = replaceTimeline.empty() ? earliestToRetainTimestamp : firstSavepoint; + LOG.info(String.format("Setting Earliest Commit to Not Archive to %s since replace timeline is empty, earliestCommitToRetain: %s, total list of savepointed timestamps : %s", + earliestCommitToNotArchive, earliestToRetain, Arrays.toString(savepointedTimestamps.toArray()))); + } else { + earliestCommitToNotArchive = earliestToRetainTimestamp; + LOG.info(String.format("Setting Earliest Commit to Not Archive to %s, earliestCommitToRetain: %s, total list of savepointed timestamps : %s", earliestCommitToNotArchive, + earliestToRetain, Arrays.toString(savepointedTimestamps.toArray()))); + } + } + } else { + // no savepoints found. + // lets set the value regardless. Would help us differentiate older versions of hudi where this will not be set vs latest version where this will be set to either earliest savepoint + // or the earliest commit to retain. + if (earliestToRetain.isPresent()) { + LOG.info(String.format("Setting Earliest Commit to Not Archive same as Earliest commit to retain to %s", earliestToRetain.get().getTimestamp())); + earliestCommitToNotArchive = earliestToRetain.get().getTimestamp(); + } + } + earliestCommitToNotArchiveOpt = Option.ofNullable(earliestCommitToNotArchive); Review Comment: Can we use a separate variable like `skippedReplacedSavepoints` apart from the `earliestToRetainTimestamp` ? And let the archiver to decide which one to use. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org