[GitHub] [incubator-hudi] hddong commented on a change in pull request #1558: [HUDI-796]: added deduping logic for upserts case

GitBox Wed, 20 May 2020 08:00:26 -0700


hddong commented on a change in pull request #1558:
URL: https://github.com/apache/incubator-hudi/pull/1558#discussion_r428056297




##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
##########
@@ -263,13 +265,26 @@ private static int compact(JavaSparkContext jsc, String 
basePath, String tableNa
   }
 
   private static int deduplicatePartitionPath(JavaSparkContext jsc, String 
duplicatedPartitionPath,
-      String repairedOutputPath, String basePath, String dryRun) {
+      String repairedOutputPath, String basePath, boolean dryRun, String 
dedupeType) {
     DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, 
repairedOutputPath, new SQLContext(jsc),
-        FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
-    job.fixDuplicates(Boolean.parseBoolean(dryRun));
+        FSUtils.getFs(basePath, jsc.hadoopConfiguration()), 
getDedupeType(dedupeType));
+    job.fixDuplicates(dryRun);
     return 0;
   }
 
+  private static Enumeration.Value getDedupeType(String type) {
+    switch (type) {
+      case "insertType":
+        return DeDupeType.insertType();
+      case "updateType":
+        return DeDupeType.updateType();
+      case "upsertType":
+        return DeDupeType.upsertType();
+      default:
+        throw new IllegalArgumentException("Please provide valid dedupe 
type!");
+    }
+  }
+

Review comment:
       You can use `DeDupeType.withName("insertType")` instead.

##########
File path: hudi-cli/src/main/scala/org/apache/hudi/cli/DeDupeType.scala
##########
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli
+
+object DeDupeType extends Enumeration {
+
+  type dedupeType = Value
+
+  val insertType = Value("insertType")
+  val updateType = Value("updateType")
+  val upsertType = Value("upsertType")

Review comment:
       Can we make these all uppercase to keep the format consistent with `WriteOperationType`?
   
https://github.com/apache/incubator-hudi/blob/74ecc27e920c70fa4598d8e5a696954203a5b127/hudi-common/src/main/java/org/apache/hudi/common/model/WriteOperationType.java#L30-L34

##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
##########
@@ -263,13 +265,26 @@ private static int compact(JavaSparkContext jsc, String 
basePath, String tableNa
   }
 
   private static int deduplicatePartitionPath(JavaSparkContext jsc, String 
duplicatedPartitionPath,
-      String repairedOutputPath, String basePath, String dryRun) {
+      String repairedOutputPath, String basePath, boolean dryRun, String 
dedupeType) {
     DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, 
repairedOutputPath, new SQLContext(jsc),
-        FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
-    job.fixDuplicates(Boolean.parseBoolean(dryRun));
+        FSUtils.getFs(basePath, jsc.hadoopConfiguration()), 
getDedupeType(dedupeType));
+    job.fixDuplicates(dryRun);
     return 0;
   }
 
+  private static Enumeration.Value getDedupeType(String type) {
+    switch (type) {
+      case "insertType":
+        return DeDupeType.insertType();
+      case "updateType":
+        return DeDupeType.updateType();
+      case "upsertType":
+        return DeDupeType.upsertType();
+      default:
+        throw new IllegalArgumentException("Please provide valid dedupe 
type!");
+    }
+  }
+

Review comment:
       Can we use `DeDupeType.withName("insertType")` instead?

##########
File path: 
hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java
##########
@@ -77,7 +77,9 @@ public String deduplicate(
           help = "Spark executor memory") final String sparkMemory,
       @CliOption(key = {"dryrun"},
           help = "Should we actually remove duplicates or just run and store 
result to repairedOutputPath",
-          unspecifiedDefaultValue = "true") final boolean dryRun)
+          unspecifiedDefaultValue = "true") final boolean dryRun,
+      @CliOption(key = {"dedupeType"}, help = "Check DeDupeType.scala for 
valid values",
+          unspecifiedDefaultValue = "insertType") final String dedupeType)

Review comment:
       It would be better to list the three valid types in the help string and to
validate the type on the first line of the command.

##########
File path: hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala
##########
@@ -98,34 +97,92 @@ class DedupeSparkJob(basePath: String,
         ON h.`_hoodie_record_key` = d.dupe_key
                       """
     val dupeMap = sqlContext.sql(dupeDataSql).collectAsList().groupBy(r => 
r.getString(0))
-    val fileToDeleteKeyMap = new HashMap[String, HashSet[String]]()
+    getDedupePlan(dupeMap)
+  }
 
-    // Mark all files except the one with latest commits for deletion
+  private def getDedupePlan(dupeMap: Map[String, Buffer[Row]]): 
HashMap[String, HashSet[String]] = {
+    val fileToDeleteKeyMap = new HashMap[String, HashSet[String]]()
     dupeMap.foreach(rt => {
       val (key, rows) = rt
-      var maxCommit = -1L
-
-      rows.foreach(r => {
-        val c = r(3).asInstanceOf[String].toLong
-        if (c > maxCommit)
-          maxCommit = c
-      })
-
-      rows.foreach(r => {
-        val c = r(3).asInstanceOf[String].toLong
-        if (c != maxCommit) {
-          val f = r(2).asInstanceOf[String].split("_")(0)
-          if (!fileToDeleteKeyMap.contains(f)) {
-            fileToDeleteKeyMap(f) = HashSet[String]()
-          }
-          fileToDeleteKeyMap(f).add(key)
-        }
-      })
+
+      dedupeType match {
+        case DeDupeType.updateType =>
+          /*
+          This corresponds to the case where all duplicates have been updated 
at least once.
+          Once updated, duplicates are bound to have same commit time unless 
forcefully modified.
+          */
+          rows.init.foreach(r => {
+            val f = r(2).asInstanceOf[String].split("_")(0)
+            if (!fileToDeleteKeyMap.contains(f)) {
+              fileToDeleteKeyMap(f) = HashSet[String]()
+            }
+            fileToDeleteKeyMap(f).add(key)
+          })
+        case DeDupeType.insertType =>
+          /*
+          This corresponds to the case where duplicates got created due to 
INSERT and have never been updated.
+          */
+          var maxCommit = -1L
+
+          rows.foreach(r => {
+            val c = r(3).asInstanceOf[String].toLong
+            if (c > maxCommit)
+              maxCommit = c
+          })
+          rows.foreach(r => {
+            val c = r(3).asInstanceOf[String].toLong
+            if (c != maxCommit) {
+              val f = r(2).asInstanceOf[String].split("_")(0)
+              if (!fileToDeleteKeyMap.contains(f)) {
+                fileToDeleteKeyMap(f) = HashSet[String]()
+              }
+              fileToDeleteKeyMap(f).add(key)
+            }
+          })
+
+        case DeDupeType.upsertType =>
+          /*
+          This corresponds to the case where duplicates got created as a 
result of inserts as well as updates,
+          i.e few duplicate records have been updated, while others were never 
updated.
+           */
+          var maxCommit = -1L
+
+          rows.foreach(r => {
+            val c = r(3).asInstanceOf[String].toLong
+            if (c > maxCommit)
+              maxCommit = c
+          })
+          val rowsWithMaxCommit = new ListBuffer[Row]()
+          rows.foreach(r => {
+            val c = r(3).asInstanceOf[String].toLong
+            if (c != maxCommit) {
+              val f = r(2).asInstanceOf[String].split("_")(0)
+              if (!fileToDeleteKeyMap.contains(f)) {
+                fileToDeleteKeyMap(f) = HashSet[String]()
+              }
+              fileToDeleteKeyMap(f).add(key)
+            } else {
+              rowsWithMaxCommit += r
+            }
+          })
+
+          rowsWithMaxCommit.toList.init.foreach(r => {
+            val f = r(2).asInstanceOf[String].split("_")(0)
+            if (!fileToDeleteKeyMap.contains(f)) {
+              fileToDeleteKeyMap(f) = HashSet[String]()
+            }
+            fileToDeleteKeyMap(f).add(key)
+          })
+
+        case _ => throw new IllegalArgumentException("Please provide valid 
type for deduping!")
+      }
     })
+    LOG.debug("fileToDeleteKeyMap size : " + fileToDeleteKeyMap.size + ", map: 
" + fileToDeleteKeyMap)

Review comment:
       Can we use `$` string interpolation to get the values? Like:
   
https://github.com/apache/incubator-hudi/blob/74ecc27e920c70fa4598d8e5a696954203a5b127/hudi-cli/src/main/scala/org/apache/hudi/cli/DedupeSparkJob.scala#L144




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [incubator-hudi] hddong commented on a change in pull request #1558: [HUDI-796]: added deduping logic for upserts case

Reply via email to