danny0405 commented on code in PR #11440:
URL: https://github.com/apache/hudi/pull/11440#discussion_r1674898261


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java:
##########
@@ -553,13 +552,62 @@ public Pair<Boolean, List<CleanFileInfo>> 
getDeletePaths(String partitionPath, O
    * Returns the earliest commit to retain based on cleaning policy.
    */
   public Option<HoodieInstant> getEarliestCommitToRetain() {
-    return CleanerUtils.getEarliestCommitToRetain(
-        
hoodieTable.getMetaClient().getActiveTimeline().getCommitsAndCompactionTimeline(),
-        config.getCleanerPolicy(),
-        config.getCleanerCommitsRetained(),
-        Instant.now(),
-        config.getCleanerHoursRetained(),
-        hoodieTable.getMetaClient().getTableConfig().getTimelineTimezone());
+    if (!earliestCommitToRetain.isPresent()) {
+      earliestCommitToRetain = CleanerUtils.getEarliestCommitToRetain(
+          
hoodieTable.getMetaClient().getActiveTimeline().getCommitsAndCompactionTimeline(),
+          config.getCleanerPolicy(),
+          config.getCleanerCommitsRetained(),
+          Instant.now(),
+          config.getCleanerHoursRetained(),
+          hoodieTable.getMetaClient().getTableConfig().getTimelineTimezone());
+    }
+    return earliestCommitToRetain;
+  }
+
+  /**
+   * Returns earliest commit to not archive to assist with guarding archival.
+   * @return
+   */
+  public Option<String> getEarliestCommitToNotArchive() {
+    // compute only if not set
+    if (!earliestCommitToNotArchiveOpt.isPresent() && 
config.getCleanerPolicy() != HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { 
// earliest commit to retain and earliest commit to not archive
+      // makes sense only when clean policy is num commits based or hours 
based.
+      Option<HoodieInstant> earliestToRetain = getEarliestCommitToRetain();
+      String earliestCommitToNotArchive = null;
+      if (!savepointedTimestamps.isEmpty()) { // if savepoints are found.
+        // find the first savepoint timestamp
+        String firstSavepoint = 
savepointedTimestamps.stream().sorted().findFirst().get();
+        earliestCommitToNotArchive = firstSavepoint;
+
+        // earliest commit to not archive = min (earliest commit to retain, 
earliest commit to not archive)
+        if (earliestToRetain.isPresent()) {
+          String earliestToRetainTimestamp = 
earliestToRetain.get().getTimestamp();
+          // When earliestToRetainTimestamp is greater than first savepoint 
timestamp but there are no
+          // replace commits between the first savepoint and the 
earliestToRetainTimestamp, we can set the
+          // earliestCommitToNotArchive to earliestToRetainTimestamp
+          if (HoodieTimeline.compareTimestamps(earliestToRetainTimestamp, 
HoodieTimeline.GREATER_THAN, firstSavepoint)) {
+            HoodieTimeline replaceTimeline = 
hoodieTable.getActiveTimeline().getCompletedReplaceTimeline().findInstantsInClosedRange(firstSavepoint,
 earliestToRetainTimestamp);
+            earliestCommitToNotArchive = replaceTimeline.empty() ? 
earliestToRetainTimestamp : firstSavepoint;
+            LOG.info(String.format("Setting Earliest Commit to Not Archive to 
%s since replace timeline is empty, earliestCommitToRetain: %s, total list of 
savepointed timestamps : %s",
+                earliestCommitToNotArchive, earliestToRetain, 
Arrays.toString(savepointedTimestamps.toArray())));
+          } else {
+            earliestCommitToNotArchive = earliestToRetainTimestamp;
+            LOG.info(String.format("Setting Earliest Commit to Not Archive to 
%s, earliestCommitToRetain: %s, total list of savepointed timestamps : %s", 
earliestCommitToNotArchive,
+                earliestToRetain, 
Arrays.toString(savepointedTimestamps.toArray())));
+          }
+        }
+      } else {
+        // no savepoints found.
+        // lets set the value regardless. Would help us differentiate older 
versions of hudi where this will not be set vs latest version where this will 
be set to either earliest savepoint
+        // or the earliest commit to retain.
+        if (earliestToRetain.isPresent()) {
+          LOG.info(String.format("Setting Earliest Commit to Not Archive same 
as Earliest commit to retain to %s", earliestToRetain.get().getTimestamp()));
+          earliestCommitToNotArchive = earliestToRetain.get().getTimestamp();
+        }
+      }
+      earliestCommitToNotArchiveOpt = 
Option.ofNullable(earliestCommitToNotArchive);

Review Comment:
   Can we use a separate variable like `skippedReplacedSavepoints` apart from 
the `earliestToRetainTimestamp` ? And let the archiver to decide which one to 
use.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to