Neer393 commented on code in PR #5987: URL: https://github.com/apache/hive/pull/5987#discussion_r2217702955
########## iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java: ########## @@ -786,14 +786,19 @@ private static FilesForCommit collectResults(int numTasks, ExecutorService execu .retry(3) .run(taskId -> { final String taskFileName = generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId); - final FilesForCommit files = readFileForCommit(taskFileName, io); - LOG.debug("Found Iceberg commitTask manifest file: {}\n{}", taskFileName, files); - - dataFiles.addAll(files.dataFiles()); - deleteFiles.addAll(files.deleteFiles()); - replacedDataFiles.addAll(files.replacedDataFiles()); - referencedDataFiles.addAll(files.referencedDataFiles()); - mergedAndDeletedFiles.addAll(files.mergedAndDeletedFiles()); + try { + final FilesForCommit files; + files = readFileForCommit(taskFileName, io); + LOG.debug("Found Iceberg commitTask manifest file: {}\n{}", taskFileName, files); + + dataFiles.addAll(files.dataFiles()); + deleteFiles.addAll(files.deleteFiles()); + replacedDataFiles.addAll(files.replacedDataFiles()); + referencedDataFiles.addAll(files.referencedDataFiles()); + mergedAndDeletedFiles.addAll(files.mergedAndDeletedFiles()); + } catch (NotFoundException e) { Review Comment: I have a possible solution. While committing task files, we maintain a list of the task file names that have been committed; when we are done, we create a new file in the same directory (perhaps called commitTasksInfo) and store this list there. Then, when we are collecting results, we read this commitTasksInfo file to determine which commit files were created, and only try to retrieve those. How does this sound? @SourabhBadhya @deniskuzZ @abstractdog -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org