saxenapranav commented on code in PR #6716:
URL: https://github.com/apache/hadoop/pull/6716#discussion_r1577303182


##########
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java:
##########
@@ -142,64 +154,104 @@ protected Result executeStage(
     }
 
     Outcome outcome = null;
-    IOException exception;
+    IOException exception = null;
+    boolean baseDirDeleted = false;
 
 
     // to delete.
     LOG.info("{}: Deleting job directory {}", getName(), baseDir);
+    final long directoryCount = args.directoryCount;
+    if (directoryCount > 0) {
+      // log the expected directory count, which drives duration in GCS
+      // and may cause timeouts on azure if the count is too high for a
+      // timely permissions tree scan.
+      LOG.info("{}: Expected directory count: {}", getName(), directoryCount);
+    }
 
+    progress();
+    // check and maybe execute parallel delete of task attempt dirs.
     if (args.deleteTaskAttemptDirsInParallel) {
-      // Attempt to do a parallel delete of task attempt dirs;
-      // don't overreact if a delete fails, but stop trying
-      // to delete the others, and fall back to deleting the
-      // job dir.
-      Path taskSubDir
-          = getStageConfig().getJobAttemptTaskSubDir();
-      try (DurationInfo info = new DurationInfo(LOG,
-          "parallel deletion of task attempts in %s",
-          taskSubDir)) {
-        RemoteIterator<FileStatus> dirs =
-            RemoteIterators.filteringRemoteIterator(
-                listStatusIterator(taskSubDir),
-                FileStatus::isDirectory);
-        TaskPool.foreach(dirs)
-            .executeWith(getIOProcessors())
-            .stopOnFailure()
-            .suppressExceptions(false)
-            .run(this::rmTaskAttemptDir);
-        getIOStatistics().aggregate((retrieveIOStatistics(dirs)));
-
-        if (getLastDeleteException() != null) {
-          // one of the task attempts failed.
-          throw getLastDeleteException();
+
+
+      if (args.parallelDeleteAttemptBaseDeleteFirst) {
+        // attempt to delete the base dir first.
+        // This can reduce ABFS delete load but may time out
+        // (which the fallback to parallel delete will handle).
+        // on GCS it is slow.
+        try (DurationInfo info = new DurationInfo(LOG, true,
+            "Initial delete of %s", baseDir)) {
+          exception = deleteOneDir(baseDir);
+          if (exception == null) {
+            // success: record this as the outcome,
+            outcome = Outcome.DELETED;
+            // and will skip the parallel delete
+            baseDirDeleted = true;
+          } else {
+            // failure: log and continue
+            LOG.warn("{}: Exception on initial attempt at deleting base dir {}"
+                    + " with directory count {}. Falling back to parallel delete",
+                getName(), baseDir, directoryCount, exception);
+          }
+        }
+      }
+      if (!baseDirDeleted) {
+        // no base delete attempted or it failed.
+        // Attempt to do a parallel delete of task attempt dirs;
+        // don't overreact if a delete fails, but stop trying
+        // to delete the others, and fall back to deleting the
+        // job dir.
+        Path taskSubDir
+            = getStageConfig().getJobAttemptTaskSubDir();
+        try (DurationInfo info = new DurationInfo(LOG, true,
+            "parallel deletion of task attempts in %s",
+            taskSubDir)) {
+          RemoteIterator<FileStatus> dirs =
+              RemoteIterators.filteringRemoteIterator(
+                  listStatusIterator(taskSubDir),
+                  FileStatus::isDirectory);
+          TaskPool.foreach(dirs)
+              .executeWith(getIOProcessors())
+              .stopOnFailure()
+              .suppressExceptions(false)
+              .run(this::rmTaskAttemptDir);
+          getIOStatistics().aggregate((retrieveIOStatistics(dirs)));
+
+          if (getLastDeleteException() != null) {
+            // one of the task attempts failed.
+            throw getLastDeleteException();

Review Comment:
   This will also register an exception raised in `deleteOneDir` at line 183. So even though there was no task failure, it would still get recorded as a task failure in the logs.
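
   A minimal sketch of one way to avoid that, assuming a hypothetical
   resetLastDeleteException() helper (the real class may track the last
   delete failure differently): clear the failure recorded by the initial
   deleteOneDir() attempt before starting the parallel phase, so that
   getLastDeleteException() can only reflect failures from rmTaskAttemptDir().

       if (!baseDirDeleted) {
         // hypothetical helper: discard the exception recorded by the failed
         // base-dir delete so it is not misreported as a task-attempt failure
         resetLastDeleteException();

         Path taskSubDir = getStageConfig().getJobAttemptTaskSubDir();
         try (DurationInfo info = new DurationInfo(LOG, true,
             "parallel deletion of task attempts in %s", taskSubDir)) {
           // ... parallel delete of task attempt dirs as above ...

           // a non-null value here now comes only from rmTaskAttemptDir()
           if (getLastDeleteException() != null) {
             throw getLastDeleteException();
           }
         }
       }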


