This is an automated email from the ASF dual-hosted git repository.

pwason pushed a commit to branch release-0.14.0
in repository https://gitbox.apache.org/repos/asf/hudi.git
commit 8f07023948ea4b22f843b4b89602e887f6b56ab2
Author: leosanqing <liurongtong...@qq.com>
AuthorDate: Mon Aug 14 11:28:14 2023 +0800

    [HUDI-6675] Fix Clean action will delete the whole table (#9413)

    The clean action mistakenly deletes the whole table when the table is non-partitioned.

    ---------

    Co-authored-by: Sagar Sumit <sagarsumi...@gmail.com>
---
 .../table/action/clean/CleanActionExecutor.java    | 10 ++++-
 .../java/org/apache/hudi/table/TestCleaner.java    | 51 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
index c04f1ba8f21..05e1056324a 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
@@ -54,6 +54,8 @@ import java.util.Map;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty;
+
 public class CleanActionExecutor<T, I, K, O> extends BaseActionExecutor<T, I, K, O, HoodieCleanMetadata> {
 
   private static final long serialVersionUID = 1L;
@@ -144,10 +146,14 @@ public class CleanActionExecutor<T, I, K, O> extends BaseActionExecutor<T, I, K,
     Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
         .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
 
-    List<String> partitionsToBeDeleted = cleanerPlan.getPartitionsToBeDeleted() != null ? cleanerPlan.getPartitionsToBeDeleted() : new ArrayList<>();
+    List<String> partitionsToBeDeleted = table.getMetaClient().getTableConfig().isTablePartitioned() && cleanerPlan.getPartitionsToBeDeleted() != null
+        ? cleanerPlan.getPartitionsToBeDeleted()
+        : new ArrayList<>();
     partitionsToBeDeleted.forEach(entry -> {
       try {
-        deleteFileAndGetResult(table.getMetaClient().getFs(), table.getMetaClient().getBasePath() + "/" + entry);
+        if (!isNullOrEmpty(entry)) {
+          deleteFileAndGetResult(table.getMetaClient().getFs(), table.getMetaClient().getBasePath() + "/" + entry);
+        }
       } catch (IOException e) {
         LOG.warn("Partition deletion failed " + entry);
       }
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java
index f8d37e859d8..c2aceae0b52 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java
@@ -36,6 +36,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieCleaningPolicy;
 import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
 import org.apache.hudi.common.model.HoodieTableType;
@@ -94,6 +95,7 @@ import java.util.stream.Stream;
 
 import scala.Tuple3;
 
+import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.NO_PARTITION_PATH;
 import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime;
 import static org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS;
 import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
@@ -352,6 +354,55 @@ public class TestCleaner extends HoodieCleanerTestBase {
     }
   }
 
+  /**
+   * Test cleaning a non-partitioned table.
+   * Ensures that the clean action does not delete the whole table's data.
+   */
+  @Test
+  public void testCleanNonPartitionedTable() throws IOException {
+    HoodieWriteConfig writeConfig = getConfigBuilder().withPath(basePath)
+        .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
+            .withEnableBackupForRemoteFileSystemView(false)
+            .build())
+        .withCleanConfig(HoodieCleanConfig.newBuilder()
+            .withAutoClean(false)
+            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
+            .retainCommits(1)
+            .build())
+        .withEmbeddedTimelineServerEnabled(false).build();
+    // data generator for a non-partitioned table
+    initTestDataGenerator(new String[] {NO_PARTITION_PATH});
+    // init a non-partitioned table
+    HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, HoodieFileFormat.PARQUET,
+        true, "org.apache.hudi.keygen.NonpartitionedKeyGenerator", true);
+
+    try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) {
+      String instantTime;
+      for (int idx = 0; idx < 3; ++idx) {
+        instantTime = HoodieActiveTimeline.createNewInstantTime();
+        List<HoodieRecord> records = dataGen.generateInserts(instantTime, 1);
+        client.startCommitWithTime(instantTime);
+        client.insert(jsc.parallelize(records, 1), instantTime).collect();
+      }
+
+      instantTime = HoodieActiveTimeline.createNewInstantTime();
+      HoodieTable table = HoodieSparkTable.create(writeConfig, context);
+      Option<HoodieCleanerPlan> cleanPlan = table.scheduleCleaning(context, instantTime, Option.empty());
+      assertEquals(cleanPlan.get().getPartitionsToBeDeleted().size(), 0);
+      assertEquals(cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(NO_PARTITION_PATH).size(), 1);
+      table.getMetaClient().reloadActiveTimeline();
+      String filePathToClean = cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(NO_PARTITION_PATH).get(0).getFilePath();
+      // clean
+      HoodieCleanMetadata cleanMetadata = table.clean(context, instantTime);
+      // check the cleaned file
+      assertEquals(cleanMetadata.getPartitionMetadata().get(NO_PARTITION_PATH).getSuccessDeleteFiles().size(), 1);
+      assertTrue(filePathToClean.contains(cleanMetadata.getPartitionMetadata().get(NO_PARTITION_PATH).getSuccessDeleteFiles().get(0)));
+      // ensure the table is not fully cleaned and still has a file group
+      assertTrue(FSUtils.isTableExists(basePath, fs));
+      assertTrue(table.getFileSystemView().getAllFileGroups(NO_PARTITION_PATH).findAny().isPresent());
+    }
+  }
+
   /**
    * Tests no more than 1 clean is scheduled if hoodie.clean.allow.multiple config is set to false.
    */
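
Why the unguarded code wiped the table: a non-partitioned Hudi table stores its data directly under the base path, and its relative partition path is the empty string. The cleaner built each deletion target as basePath + "/" + entry, so an empty entry collapses to the base path itself, and deleting that removes the whole table, .hoodie metadata included. Below is a minimal, illustrative sketch of the path collapse (hypothetical paths, assuming hadoop-common on the classpath; this is not Hudi's own code):

import org.apache.hadoop.fs.Path;

public class EmptyPartitionPathDemo {
  public static void main(String[] args) {
    String basePath = "/tmp/hudi_table"; // hypothetical table base path
    String partition = "";               // non-partitioned tables use an empty relative partition path

    // The cleaner concatenated basePath + "/" + partition before deleting.
    Path target = new Path(basePath + "/" + partition);

    // Hadoop's Path normalizes away the trailing slash, so the "partition"
    // directory to delete is the table root itself.
    System.out.println(target);                            // prints /tmp/hudi_table
    System.out.println(target.equals(new Path(basePath))); // prints true
  }
}

The fix guards both ends: for a non-partitioned table (isTablePartitioned() is false) the plan's partitionsToBeDeleted list is ignored entirely, and any null or empty entry is skipped via isNullOrEmpty before deletion, so no deletion target can resolve to the table root.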