This is an automated email from the ASF dual-hosted git repository.

voonhous pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 91f3d227264d perf(clean): Avoid extra getPathInfo RPC per file during 
clean execution (#18963)
91f3d227264d is described below

commit 91f3d227264da8d5ca91c9943481867569135db6
Author: voonhous <[email protected]>
AuthorDate: Thu Jun 11 13:52:52 2026 +0800

    perf(clean): Avoid extra getPathInfo RPC per file during clean execution 
(#18963)
    
    * perf(clean): Avoid extra getPathInfo RPC per file during clean execution
    
    Cleaner plan file entries are always base/log/bootstrap file paths,
    never directories, so delete them directly with deleteFile instead of
    calling getPathInfo first to test isDirectory. This halves the RPC
    count of the clean execution phase on cloud storage.
    
    Directory-capable deletion is kept as deletePathAndGetResult and is
    only used for deleting whole partitions.
    
    * review: Rename to deleteDirAndGetResult and drop the isDirectory probe
    
    Only partition directories reach this method, so delete recursively
    without the getPathInfo type probe, saving that round trip per deleted
    partition as well. A missing directory on a retried clean surfaces as
    a false return from delete and is treated as already cleaned, same as
    the file variant.
    
    * review: Combine file and directory delete into one method with an 
explicit isDirectory param
    
    The two variants were identical apart from the deleteFile/deleteDirectory
    call and the log noun, so merge them and let the two call sites pass the
    path type explicitly.
---
 .../table/action/clean/CleanActionExecutor.java    | 45 +++++++++++++---------
 .../table/functional/TestCleanActionExecutor.java  | 20 +++++++---
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
index 2a636221ae60..0d27aa2a9f29 100644
--- 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
+++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java
@@ -68,38 +68,45 @@ public class CleanActionExecutor<T, I, K, O> extends 
BaseActionExecutor<T, I, K,
     this.txnManager = new TransactionManager(config, table.getStorage());
   }
 
-  private static boolean deleteFileAndGetResult(HoodieStorage storage, String 
deletePathStr) {
+  /**
+   * Deletes the given path and returns whether it is gone afterwards. Cleaner 
plan file entries
+   * are always base/log/bootstrap file paths and partition deletions are 
always directories, so
+   * the caller passes the path type explicitly and no getPathInfo probe is 
needed.
+   *
+   * @param isDirectory true for a partition directory (deleted recursively), 
false for a file
+   */
+  private static boolean deleteAndGetResult(HoodieStorage storage, String 
deletePathStr, boolean isDirectory) {
     StoragePath deletePath = new StoragePath(deletePathStr);
-    log.debug("Working on delete path: {}", deletePath);
+    String pathType = isDirectory ? "directory" : "file";
+    log.debug("Working on deleting {}: {}", pathType, deletePath);
     try {
-      boolean deleteResult = storage.getPathInfo(deletePath).isDirectory()
-          ? storage.deleteDirectory(deletePath)
-          : storage.deleteFile(deletePath);
+      boolean deleteResult = isDirectory ? storage.deleteDirectory(deletePath) 
: storage.deleteFile(deletePath);
       if (deleteResult) {
-        log.debug("Cleaned file at path: {}", deletePath);
-      } else {
-        if (storage.exists(deletePath)) {
-          throw new HoodieIOException("Failed to delete path during clean 
execution " + deletePath);
-        } else {
-          log.debug("Already cleaned up file at path: {}", deletePath);
-        }
+        log.debug("Cleaned {}: {}", pathType, deletePath);
+        return true;
       }
-      return deleteResult;
+      if (storage.exists(deletePath)) {
+        throw new HoodieIOException("Failed to delete " + pathType + " during 
clean execution " + deletePath);
+      }
+      // Hadoop file systems report a missing path by returning false from 
delete instead of
+      // throwing FileNotFoundException, so this is the regular retried-clean 
case below.
+      log.debug("Already cleaned up {}: {}", pathType, deletePath);
+      return true;
     } catch (FileNotFoundException fio) {
-      // With cleanPlan being used for retried cleaning operations, its 
possible to clean a file twice if a file to be
+      // With cleanPlan being used for retried cleaning operations, its 
possible to clean a path twice if a path to be
       // deleted is not found, treat it as a success.  In other words, there 
is nothing else to be cleaned up on the
       // FileSystem, except for updating the MDT.  By returning success, we 
would remove the entry from MDT.
       return true;
     } catch (IOException e) {
       try {
         if (storage.exists(deletePath)) {
-          log.error("Delete file failed: {} and file still exists", 
deletePath, e);
+          log.error("Delete {} failed: {} and it still exists", pathType, 
deletePath, e);
           throw new HoodieIOException(e.getMessage(), e);
         }
-        log.warn("Delete file failed: {} but file does not exist", deletePath, 
e);
+        log.warn("Delete {} failed: {} but it does not exist", pathType, 
deletePath, e);
         return false;
       } catch (IOException ex) {
-        log.error("Delete file failed: {} with exception: {} and existence 
check also failed", deletePath, e, ex);
+        log.error("Delete {} failed: {} with exception: {} and existence check 
also failed", pathType, deletePath, e, ex);
         throw new HoodieIOException(ex.getMessage(), ex);
       }
     }
@@ -113,7 +120,7 @@ public class CleanActionExecutor<T, I, K, O> extends 
BaseActionExecutor<T, I, K,
       String partitionPath = partitionDelFileTuple.getLeft();
       StoragePath deletePath = new 
StoragePath(partitionDelFileTuple.getRight().getFilePath());
       String deletePathStr = deletePath.toString();
-      boolean deletedFileResult = deleteFileAndGetResult(storage, 
deletePathStr);
+      boolean deletedFileResult = deleteAndGetResult(storage, deletePathStr, 
false);
       final PartitionCleanStat partitionCleanStat =
           partitionCleanStatMap.computeIfAbsent(partitionPath, k -> new 
PartitionCleanStat(partitionPath));
       boolean isBootstrapBasePathFile = 
partitionDelFileTuple.getRight().isBootstrapBaseFile();
@@ -161,7 +168,7 @@ public class CleanActionExecutor<T, I, K, O> extends 
BaseActionExecutor<T, I, K,
         : Collections.emptyList();
     partitionsToBeDeleted.forEach(entry -> {
       if (!isNullOrEmpty(entry)) {
-        deleteFileAndGetResult(table.getStorage(), 
table.getMetaClient().getBasePath() + "/" + entry);
+        deleteAndGetResult(table.getStorage(), 
table.getMetaClient().getBasePath() + "/" + entry, true);
       }
     });
 
diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
index 10b9891cb27f..f7fee3437ab0 100644
--- 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
+++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanActionExecutor.java
@@ -66,7 +66,9 @@ import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
 import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
 /**
@@ -161,23 +163,26 @@ public class TestCleanActionExecutor {
 
     CleanActionExecutor cleanActionExecutor = new CleanActionExecutor(context, 
config, mockHoodieTable, "002");
     if (failureType == CleanFailureType.TRUE_ON_DELETE) {
-      assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+      assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
     } else if (failureType == 
CleanFailureType.FALSE_ON_DELETE_IS_EXISTS_FALSE) {
-      assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+      // a missing file on a retried clean counts as a successful delete so 
its MDT entry is removed
+      assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
     } else if (failureType == CleanFailureType.FALSE_ON_DELETE_IS_EXISTS_TRUE) 
{
       assertCleanExecutionFailure(cleanActionExecutor);
     } else if (failureType == CleanFailureType.FILE_NOT_FOUND_EXC_ON_DELETE) {
-      assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+      assertCleanExecutionSuccess(cleanActionExecutor, filePath, true);
     } else if (failureType == CleanFailureType.IO_EXCEPTION) {
       assertCleanExecutionFailure(cleanActionExecutor);
     } else if (failureType == CleanFailureType.IO_EXCEPTION_AND_EXISTS) {
       assertCleanExecutionFailure(cleanActionExecutor);
     } else if (failureType == CleanFailureType.IO_EXCEPTION_BUT_NOT_EXISTS) {
-      assertCleanExecutionSuccess(cleanActionExecutor, filePath);
+      assertCleanExecutionSuccess(cleanActionExecutor, filePath, false);
     } else {
       // run time exception
       assertCleanExecutionFailure(cleanActionExecutor);
     }
+    // file deletions must not stat the path first; deletes go straight to 
storage
+    verify(storage, never()).getPathInfo(filePath);
   }
 
   private void assertCleanExecutionFailure(CleanActionExecutor 
cleanActionExecutor) {
@@ -186,11 +191,16 @@ public class TestCleanActionExecutor {
     });
   }
 
-  private void assertCleanExecutionSuccess(CleanActionExecutor 
cleanActionExecutor, StoragePath filePath) {
+  private void assertCleanExecutionSuccess(CleanActionExecutor 
cleanActionExecutor, StoragePath filePath, boolean expectFileDeleteSuccess) {
     HoodieCleanMetadata cleanMetadata = cleanActionExecutor.execute();
     assertTrue(cleanMetadata.getPartitionMetadata().containsKey(PARTITION1));
     HoodieCleanPartitionMetadata cleanPartitionMetadata = 
cleanMetadata.getPartitionMetadata().get(PARTITION1);
     
assertTrue(cleanPartitionMetadata.getDeletePathPatterns().contains(filePath.getName()));
+    if (expectFileDeleteSuccess) {
+      
assertTrue(cleanPartitionMetadata.getSuccessDeleteFiles().contains(filePath.getName()));
+    } else {
+      
assertTrue(cleanPartitionMetadata.getFailedDeleteFiles().contains(filePath.getName()));
+    }
   }
 
   private static HoodieWriteConfig getCleanByCommitsConfig() {

Reply via email to