nsivabalan commented on code in PR #7580: URL: https://github.com/apache/hudi/pull/7580#discussion_r1066520614
########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java: ########## @@ -514,24 +515,27 @@ private Stream<HoodieInstant> getInstantsToArchive() throws IOException { .setBasePath(HoodieTableMetadata.getDatasetBasePath(config.getBasePath())) .setConf(metaClient.getHadoopConf()) .build(); - Option<HoodieInstant> earliestActiveDatasetCommit = dataMetaClient.getActiveTimeline().firstInstant(); - - if (config.shouldArchiveBeyondSavepoint()) { - // There are chances that there could be holes in the timeline due to archival and savepoint interplay. - // So, the first non-savepoint commit in the data timeline is considered as beginning of the active timeline. - Option<HoodieInstant> firstNonSavepointCommit = dataMetaClient.getActiveTimeline().getFirstNonSavepointCommit(); - if (firstNonSavepointCommit.isPresent()) { - String firstNonSavepointCommitTime = firstNonSavepointCommit.get().getTimestamp(); - instants = instants.filter(instant -> - compareTimestamps(instant.getTimestamp(), LESSER_THAN, firstNonSavepointCommitTime)); - } - } else { - // Do not archive the commits that live in data set active timeline. - // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata for details. - if (earliestActiveDatasetCommit.isPresent()) { - instants = instants.filter(instant -> - compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, earliestActiveDatasetCommit.get().getTimestamp())); - } + Option<HoodieInstant> qualifiedEarliestInstant = + TimelineUtils.getEarliestInstantForMetadataArchival( + dataMetaClient.getActiveTimeline(), config.shouldArchiveBeyondSavepoint()); + + // Do not archive the instants after the earliest commit (COMMIT, DELTA_COMMIT, and + // REPLACE_COMMIT only, considering non-savepoint commit only if enabling archive + // beyond savepoint) and the earliest inflight instant (all actions). + // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata + // for details. + // Note that we cannot blindly use the earliest instant of all actions, because CLEAN and + // ROLLBACK instants are archived separately apart from commits (check + // HoodieTimelineArchiver#getCleanInstantsToArchive). If we do so, a very old completed + // CLEAN or ROLLBACK instant can block the archive of metadata table timeline and causes + // the active timeline of metadata table to be extremely long, leading to performance issues + // for loading the timeline. + if (qualifiedEarliestInstant.isPresent()) { + instants = instants.filter(instant -> + compareTimestamps( Review Comment: I have created a jira to test savepoint beyond archival. I guess, we should not be deleting the commits that are savepointed. https://issues.apache.org/jira/browse/HUDI-5525 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org