This is an automated email from the ASF dual-hosted git repository.
voonhous pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 91f3d227264d perf(clean): Avoid extra getPathInfo RPC per file during
clean execution (#18963)
91f3d227264d is described below
commit 91f3d227264da8d5ca91c9943481867569135db6
Author: voonhous <[email protected]>
AuthorDate: Thu Jun 11 13:52:52 2026 +0800
perf(clean): Avoid extra getPathInfo RPC per file during clean execution
(#18963)
* perf(clean): Avoid extra getPathInfo RPC per file during clean execution
Cleaner plan file entries are always base/log/bootstrap file paths,
never directories, so delete them directly with deleteFile instead of
calling getPathInfo first to test isDirectory. This halves the RPC
count of the clean execution phase on cloud storage.
Directory-capable deletion is kept as deletePathAndGetResult and is
only used for deleting whole partitions.
* review: Rename to deleteDirAndGetResult and drop the isDirectory probe
Only partition directories reach this method, so delete recursively
without the getPathInfo type probe, saving that round trip per deleted
partition as well. A missing directory on a retried clean surfaces as
a false return from delete and is treated as already cleaned, same as
the file variant.
* review: Combine file and directory delete into one method with an
explicit isDirectory param
The two variants were identical apart from the deleteFile/deleteDirectory
call and the log noun, so merge them and let the two call sites pass the
path type explicitly.
---
.../table/action/clean/CleanActionExecutor.java | 45 +++++++++++++---------
.../table/functional/TestCleanActionExecutor.java | 20 +++++++---
2 files changed, 41 insertions(+), 24 deletions(-)
diff --git
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
index 2a636221ae60..0d27aa2a9f29 100644
---
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
+++
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
@@ -68,38 +68,45 @@ public class CleanActionExecutor<T, I, K, O> extends
BaseActionExecutor<T, I, K,
this.txnManager = new TransactionManager(config, table.getStorage());
}
- private static boolean deleteFileAndGetResult(HoodieStorage storage, String
deletePathStr) {
+ /**
+ * Deletes the given path and returns whether it is gone afterwards. Cleaner
plan file entries
+ * are always base/log/bootstrap file paths and partition deletions are
always directories, so
+ * the caller passes the path type explicitly and no getPathInfo probe is
needed.
+ *
+ * @param isDirectory true for a partition directory (deleted recursively),
false for a file
+ */
+ private static boolean deleteAndGetResult(HoodieStorage storage, String
deletePathStr, boolean isDirectory) {
StoragePath deletePath = new StoragePath(deletePathStr);
- log.debug("Working on delete path: {}", deletePath);
+ String pathType = isDirectory ? "directory" : "file";
+ log.debug("Working on deleting {}: {}", pathType, deletePath);
try {
- boolean deleteResult = storage.getPathInfo(deletePath).isDirectory()
- ? storage.deleteDirectory(deletePath)
- : storage.deleteFile(deletePath);
+ boolean deleteResult = isDirectory ? storage.deleteDirectory(deletePath)
: storage.deleteFile(deletePath);
if (deleteResult) {
- log.debug("Cleaned file at path: {}", deletePath);
- } else {
- if (storage.exists(deletePath)) {
- throw new HoodieIOException("Failed to delete path during clean
execution " + deletePath);
- } else {
- log.debug("Already cleaned up file at path: {}", deletePath);
- }
+ log.debug("Cleaned {}: {}", pathType, deletePath);
+ return true;
}
- return deleteResult;
+ if (storage.exists(deletePath)) {
+ throw new HoodieIOException("Failed to delete " + pathType + " during
clean execution " + deletePath);
+ }
+ // Hadoop file systems report a missing path by returning false from
delete instead of
+ // throwing FileNotFoundException, so this is the regular retried-clean
case below.
+ log.debug("Already cleaned up {}: {}", pathType, deletePath);
+ return true;
} catch (FileNotFoundException fio) {
- // With cleanPlan being used for retried cleaning operations, its
possible to clean a file twice if a file to be
+ // With cleanPlan being used for retried cleaning operations, its
possible to clean a path twice if a path to be
// deleted is not found, treat it as a success. In other words, there
is nothing else to be cleaned up on the
// FileSystem, except for updating the MDT. By returning success, we
would remove the entry from MDT.
return true;
} catch (IOException e) {
try {
if (storage.exists(deletePath)) {
- log.error("Delete file failed: {} and file still exists",
deletePath, e);
+ log.error("Delete {} failed: {} and it still exists", pathType,
deletePath, e);
throw new HoodieIOException(e.getMessage(), e);
}
- log.warn("Delete file failed: {} but file does not exist", deletePath,
e);
+ log.warn("Delete {} failed: {} but it does not exist", pathType,
deletePath, e);
return false;
} catch (IOException ex) {
- log.error("Delete file failed: {} with exception: {} and existence
check also failed", deletePath, e, ex);
+ log.error("Delete {} failed: {} with exception: {} and existence check
also failed", pathType, deletePath, e, ex);
throw new HoodieIOException(ex.getMessage(), ex);
}
}
@@ -113,7 +120,7 @@ public class CleanActionExecutor<T, I, K, O> extends
BaseActionExecutor<T, I, K,
String partitionPath = partitionDelFileTuple.getLeft();
StoragePath deletePath = new
StoragePath(partitionDelFileTuple.getRight().getFilePath());
String deletePathStr = deletePath.toString();
- boolean deletedFileResult = deleteFileAndGetResult(storage,
deletePathStr);
+ boolean deletedFileResult = deleteAndGetResult(storage, deletePathStr,
false);
final PartitionCleanStat partitionCleanStat =
partitionCleanStatMap.computeIfAbsent(partitionPath, k -> new
PartitionCleanStat(partitionPath));
boolean isBootstrapBasePathFile =
partitionDelFileTuple.getRight().isBootstrapBaseFile();
@@ -161,7 +168,7 @@ public class CleanActionExecutor<T, I, K, O> extends
BaseActionExecutor<T, I, K,
: Collections.emptyList();
partitionsToBeDeleted.forEach(entry -> {
if (!isNullOrEmpty(entry)) {
- deleteFileAndGetResult(table.getStorage(),
table.getMetaClient().getBasePath() + "/" + entry);
+ deleteAndGetResult(table.getStorage(),
table.getMetaClient().getBasePath() + "/" + entry, true);
}
});
diff --git
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
index 10b9891cb27f..f7fee3437ab0 100644
---
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
+++
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
@@ -66,7 +66,9 @@ import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
/**
@@ -161,23 +163,26 @@ public class TestCleanActionExecutor {
CleanActionExecutor cleanActionExecutor = new CleanActionExecutor(context,
config, mockHoodieTable, "002");
if (failureType == CleanFailureType.TRUE_ON_DELETE) {
- assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+ assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
} else if (failureType ==
CleanFailureType.FALSE_ON_DELETE_IS_EXISTS_FALSE) {
- assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+ // a missing file on a retried clean counts as a successful delete so
its MDT entry is removed
+ assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
} else if (failureType == CleanFailureType.FALSE_ON_DELETE_IS_EXISTS_TRUE)
{
assertCleanExecutionFailure(cleanActionExecutor);
} else if (failureType == CleanFailureType.FILE_NOT_FOUND_EXC_ON_DELETE) {
- assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+ assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
} else if (failureType == CleanFailureType.IO_EXCEPTION) {
assertCleanExecutionFailure(cleanActionExecutor);
} else if (failureType == CleanFailureType.IO_EXCEPTION_AND_EXISTS) {
assertCleanExecutionFailure(cleanActionExecutor);
} else if (failureType == CleanFailureType.IO_EXCEPTION_BUT_NOT_EXISTS) {
- assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+ assertCleanExecutionSuccess(cleanActionExecutor, filePath, false);
} else {
// run time exception
assertCleanExecutionFailure(cleanActionExecutor);
}
+ // file deletions must not stat the path first; deletes go straight to
storage
+ verify(storage, never()).getPathInfo(filePath);
}
private void assertCleanExecutionFailure(CleanActionExecutor
cleanActionExecutor) {
@@ -186,11 +191,16 @@ public class TestCleanActionExecutor {
});
}
- private void assertCleanExecutionSuccess(CleanActionExecutor
cleanActionExecutor, StoragePath filePath) {
+ private void assertCleanExecutionSuccess(CleanActionExecutor
cleanActionExecutor, StoragePath filePath, boolean expectFileDeleteSuccess) {
HoodieCleanMetadata cleanMetadata = cleanActionExecutor.execute();
assertTrue(cleanMetadata.getPartitionMetadata().containsKey(PARTITION1));
HoodieCleanPartitionMetadata cleanPartitionMetadata =
cleanMetadata.getPartitionMetadata().get(PARTITION1);
assertTrue(cleanPartitionMetadata.getDeletePathPatterns().contains(filePath.getName()));
+ if (expectFileDeleteSuccess) {
+
assertTrue(cleanPartitionMetadata.getSuccessDeleteFiles().contains(filePath.getName()));
+ } else {
+
assertTrue(cleanPartitionMetadata.getFailedDeleteFiles().contains(filePath.getName()));
+ }
}
private static HoodieWriteConfig getCleanByCommitsConfig() {