n3nash commented on a change in pull request #600: Timeline Service with Incremental View Syncing support
URL: https://github.com/apache/incubator-hudi/pull/600#discussion_r274221173
 
 

 ##########
 File path: hoodie-common/src/main/java/com/uber/hoodie/common/table/view/IncrementalTimelineSyncFileSystemView.java
 ##########
 @@ -0,0 +1,329 @@
+/*
+ *  Copyright (c) 2019 Uber Technologies, Inc. (hoodie-dev-gr...@uber.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *           http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package com.uber.hoodie.common.table.view;
+
+import com.uber.hoodie.avro.model.HoodieCleanMetadata;
+import com.uber.hoodie.avro.model.HoodieCompactionPlan;
+import com.uber.hoodie.avro.model.HoodieRestoreMetadata;
+import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
+import com.uber.hoodie.common.model.CompactionOperation;
+import com.uber.hoodie.common.model.FileSlice;
+import com.uber.hoodie.common.model.HoodieCommitMetadata;
+import com.uber.hoodie.common.model.HoodieDataFile;
+import com.uber.hoodie.common.model.HoodieFileGroup;
+import com.uber.hoodie.common.model.HoodieLogFile;
+import com.uber.hoodie.common.table.HoodieTimeline;
+import com.uber.hoodie.common.table.timeline.HoodieInstant;
+import com.uber.hoodie.common.util.AvroUtils;
+import com.uber.hoodie.common.util.CompactionUtils;
+import com.uber.hoodie.common.util.TimelineDiffHelper;
+import com.uber.hoodie.common.util.TimelineDiffHelper.TimelineDiffResult;
+import com.uber.hoodie.common.util.collection.Pair;
+import com.uber.hoodie.exception.HoodieException;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+/**
+ * Adds the capability to incrementally sync changes to the file-system view as and when new instants get completed.
+ */
+public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTableFileSystemView {
+
+  private static Logger log = LogManager.getLogger(IncrementalTimelineSyncFileSystemView.class);
+
+  @Override
+  protected void runSync(HoodieTimeline oldTimeline, HoodieTimeline newTimeline) {
+    try {
+      TimelineDiffResult diffResult = TimelineDiffHelper.getNewInstantsForIncrementalSync(oldTimeline, newTimeline);
+      if (diffResult.canSyncIncrementally()) {
+        log.info("Doing incremental sync");
+        runIncrementalSync(newTimeline, diffResult);
+        log.info("Finished incremental sync");
+        // Reset timeline to latest
+        visibleActiveTimeline = newTimeline;
+      } else {
+        log.warn("Incremental Sync of timeline is deemed unsafe. Will revert 
to full syncing");
+        super.runSync(oldTimeline, newTimeline);
+      }
+    } catch (Exception e) {
+      log.error("Got exception trying to perform incremental sync. Reverting to complete sync", e);
+      super.runSync(oldTimeline, newTimeline);
+    }
+  }
+
+  /**
+   * Run incremental sync based on the diff result produced.
+   *
+   * @param timeline New Timeline
+   * @param diffResult Timeline Diff Result
+   */
+  private void runIncrementalSync(HoodieTimeline timeline, TimelineDiffResult diffResult) {
+
+    log.info("Timeline Diff Result is :" + diffResult);
+
+    // First remove pending compaction instants which were completed
+    diffResult.getFinishedCompactionInstants().stream().forEach(instant -> {
+      try {
+        removePendingCompactionInstant(timeline, instant);
+      } catch (IOException e) {
+        throw new HoodieException(e);
+      }
+    });
+
+    // Add new completed instants found in the latest timeline
+    diffResult.getNewlySeenInstants().stream()
+        .filter(instant -> instant.isCompleted() || instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION))
+        .forEach(instant -> {
+          try {
+            if (instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)
+                || instant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) {
+              addCommitInstant(timeline, instant);
+            } else if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) {
+              addRestoreInstant(timeline, instant);
+            } else if (instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)) {
+              addCleanInstant(timeline, instant);
+            } else if (instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
+              addPendingCompactionInstant(timeline, instant);
+            } else if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) {
+              addRollbackInstant(timeline, instant);
+            }
+          } catch (IOException ioe) {
+            throw new HoodieException(ioe);
+          }
+        });
+  }
+
+  /**
+   * Remove pending compaction instant
+   *
+   * @param timeline New Hoodie Timeline
+   * @param instant Compaction Instant to be removed
+   */
+  private void removePendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Removing completed compaction instant (" + instant + ")");
+    HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
+    removePendingCompactionOperations(CompactionUtils.getPendingCompactionOperations(instant, plan)
+        .map(instantPair -> Pair.of(instantPair.getValue().getKey(),
+            CompactionOperation.convertFromAvroRecordInstance(instantPair.getValue().getValue()))));
+  }
+
+  /**
+   * Add newly found compaction instant
+   *
+   * @param timeline Hoodie Timeline
+   * @param instant Compaction Instant
+   */
+  private void addPendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Syncing pending compaction instant (" + instant + ")");
+    HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
+    List<Pair<String, CompactionOperation>> pendingOps =
+        CompactionUtils.getPendingCompactionOperations(instant, compactionPlan).map(p -> Pair.of(p.getValue().getKey(),
+            CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue()))).collect(Collectors.toList());
+    // First, update pending compaction instants
+    addPendingCompactionOperations(pendingOps.stream());
+
+    Map<String, List<Pair<String, HoodieFileGroup>>> partitionToFileGroups =
+        pendingOps.stream().map(opPair -> {
+          String compactionInstantTime = opPair.getKey();
+          HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline);
+          fileGroup.addNewFileSliceAtInstant(compactionInstantTime);
+          return Pair.of(compactionInstantTime, fileGroup);
+        }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath()));
+    partitionToFileGroups.entrySet().forEach(entry -> {
+      if (isPartitionAvailableInStore(entry.getKey())) {
+        applyDeltaFileSlicesToPartitionView(entry.getKey(),
+            entry.getValue().stream().map(Pair::getValue).collect(Collectors.toList()), DeltaApplyMode.ADD);
+      }
+    });
+  }
+
+  /**
+   * Add newly found commit/delta-commit instant
+   *
+   * @param timeline Hoodie Timeline
+   * @param instant Instant
+   */
+  private void addCommitInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Syncing committed instant (" + instant + ")");
+    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(instant).get(),
+        HoodieCommitMetadata.class);
+    commitMetadata.getPartitionToWriteStats().entrySet().stream().forEach(entry -> {
+      String partition = entry.getKey();
+      if (isPartitionAvailableInStore(partition)) {
+        log.info("Syncing partition (" + partition + ") of instant (" + instant + ")");
+        FileStatus[] statuses = entry.getValue().stream().map(p -> {
+          FileStatus status = new FileStatus(p.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null,
+              new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath())));
+          return status;
+        }).toArray(FileStatus[]::new);
+        List<HoodieFileGroup> fileGroups = buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(),
+            false);
+        applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD);
+      } else {
+        log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
+      }
+    });
+    log.info("Done Syncing committed instant (" + instant + ")");
+  }
+
+  /**
+   * Add newly found restore instant
+   *
+   * @param timeline Hoodie Timeline
+   * @param instant Restore Instant
+   */
+  private void addRestoreInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Syncing restore instant (" + instant + ")");
+    HoodieRestoreMetadata metadata = AvroUtils.deserializeAvroMetadata(
+        timeline.getInstantDetails(instant).get(), HoodieRestoreMetadata.class);
+
+    Map<String, List<Pair<String, String>>> partitionFiles =
+        metadata.getHoodieRestoreMetadata().entrySet().stream().flatMap(entry -> {
+          return entry.getValue().stream().flatMap(e -> e.getPartitionMetadata().entrySet().stream().flatMap(e2 -> {
+            return e2.getValue().getSuccessDeleteFiles().stream().map(x -> Pair.of(e2.getKey(), x));
+          }));
+        }).collect(Collectors.groupingBy(Pair::getKey));
+    partitionFiles.entrySet().stream().forEach(e -> {
+      removeFileSlicesForPartition(timeline, instant, e.getKey(),
+          e.getValue().stream().map(x -> x.getValue()).collect(Collectors.toList()));
+    });
+    log.info("Done Syncing restore instant (" + instant + ")");
+  }
+
+  /**
+   * Add newly found rollback instant
+   *
+   * @param timeline Hoodie Timeline
+   * @param instant Rollback Instant
+   */
+  private void addRollbackInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Syncing rollback instant (" + instant + ")");
+    HoodieRollbackMetadata metadata = AvroUtils.deserializeAvroMetadata(
+        timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);
+
+    metadata.getPartitionMetadata().entrySet().stream().forEach(e -> {
+      removeFileSlicesForPartition(timeline, instant, e.getKey(), e.getValue().getSuccessDeleteFiles());
+    });
+    log.info("Done Syncing rollback instant (" + instant + ")");
+  }
+
+  /**
+   * Add newly found clean instant
+   *
+   * @param timeline Timeline
+   * @param instant Clean instant
+   */
+  private void addCleanInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
+    log.info("Syncing cleaner instant (" + instant + ")");
+    HoodieCleanMetadata cleanMetadata = AvroUtils
+        .deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get());
+    cleanMetadata.getPartitionMetadata().entrySet().stream().forEach(entry -> {
+      removeFileSlicesForPartition(timeline, instant, entry.getKey(), entry.getValue().getSuccessDeleteFiles());
+    });
+    log.info("Done Syncing cleaner instant (" + instant + ")");
+  }
+
+  private void removeFileSlicesForPartition(HoodieTimeline timeline, HoodieInstant instant,
+      String partition, List<String> paths) {
+    if (isPartitionAvailableInStore(partition)) {
+      log.info("Removing file slices for partition (" + partition + ") for instant (" + instant + ")");
+      FileStatus[] statuses = paths.stream().map(p -> {
+        FileStatus status = new FileStatus();
+        status.setPath(new Path(p));
+        return status;
+      }).toArray(FileStatus[]::new);
+      List<HoodieFileGroup> fileGroups = buildFileGroups(statuses,
+          timeline.filterCompletedAndCompactionInstants(), false);
+      applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.REMOVE);
+    } else {
+      log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
+    }
+  }
+
+  /**
+   * Mode indicating whether delta file-groups should be added to or removed from the partition view
+   */
+  enum DeltaApplyMode {
+    ADD,
+    REMOVE
+  }
+
+  /**
+   * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
+   * assuming some sort of map (in-mem/disk-based) is used. View implementations that support fine-granular updates
+   * (e.g. RocksDB) should override this method.
+   *
+   * @param partition Partition path
+   * @param deltaFileGroups Changed file-slices aggregated as file-groups
+   * @param mode Delta apply mode
+   */
+  protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
 
 Review comment:
   So for all implementations other than RocksDB, we reload the entire partition cache with new entries? Does this mean that everything is still synced incrementally and the key is just reloaded after syncing, or that even the incremental sync is not really incremental?
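
   To make the distinction concrete, below is a minimal, self-contained sketch (hypothetical stub classes, not the actual Hudi types or method names) of the two strategies the Javadoc above contrasts: a coarse-grained, map-backed view that rewrites the whole partition entry when a delta is applied, and a fine-grained store that only touches the keys of the changed file groups.

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-in for a file group, identified by an id.
class FileGroupStub {
  final String id;

  FileGroupStub(String id) {
    this.id = id;
  }
}

// Coarse-grained, map-backed view: the delta is merged in memory and the whole
// partition entry is then overwritten, mirroring what the Javadoc says the base
// implementation does.
class MapBackedViewSketch {
  private final Map<String, List<FileGroupStub>> partitionToFileGroups = new HashMap<>();

  void applyDelta(String partition, List<FileGroupStub> deltaFileGroups, boolean add) {
    Map<String, FileGroupStub> merged = new HashMap<>();
    for (FileGroupStub fg : partitionToFileGroups.getOrDefault(partition, new ArrayList<>())) {
      merged.put(fg.id, fg);
    }
    for (FileGroupStub fg : deltaFileGroups) {
      if (add) {
        merged.put(fg.id, fg);
      } else {
        merged.remove(fg.id);
      }
    }
    // The entire partition's value is replaced, even though the delta was small.
    partitionToFileGroups.put(partition, new ArrayList<>(merged.values()));
  }
}

// Fine-grained view (RocksDB-like): only the keys belonging to the changed file
// groups are written or deleted; other entries for the partition stay untouched.
class FineGrainedViewSketch {
  private final Map<String, FileGroupStub> store = new HashMap<>(); // key = partition + "/" + fileGroupId

  void applyDelta(String partition, List<FileGroupStub> deltaFileGroups, boolean add) {
    for (FileGroupStub fg : deltaFileGroups) {
      String key = partition + "/" + fg.id;
      if (add) {
        store.put(key, fg);
      } else {
        store.remove(key);
      }
    }
  }
}
```

   Under that reading (an assumption based only on the Javadoc in this diff), the timeline diff is computed incrementally in both cases; the difference is in how the store materializes the delta, i.e. whole-partition value replacement versus per-file-group key updates.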

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services
