[ https://issues.apache.org/jira/browse/HIVE-25790?focusedWorklogId=823024&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-823024 ]
ASF GitHub Bot logged work on HIVE-25790: ----------------------------------------- Author: ASF GitHub Bot Created on: 03/Nov/22 10:36 Start Date: 03/Nov/22 10:36 Worklog Time Spent: 10m Work Description: deniskuzZ commented on code in PR #3582: URL: https://github.com/apache/hive/pull/3582#discussion_r1012655098 ########## ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java: ########## @@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList, throw new IOException(e); } } else { - //Destination should be empty + // Destination should have identical files only if (overWrite) { - deleteSubDirs(destinationFs, destination); + leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination); } copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true); } } - private void deleteSubDirs(FileSystem fs, Path path) throws IOException { - //Delete the root path instead of doing a listing - //This is more optimised - delete(fs, path, true); - //Recreate just the Root folder - mkdirs(fs, path); + /** + * Leave identical files only between source and destination. It gets source paths. + */ + void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths, + FileSystem destinationFs, Path destinationPath) throws IOException { + for (Path srcPath : srcPaths) { + leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath, + destinationFs, new Path(destinationPath, srcPath.getName())); + } + } + + /** + * Leave identical files only between source and destination. It gets a source path. + */ + private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath, + FileSystem destinationFs, Path dstPath) throws IOException { + boolean toDelete = false; + Type dstType = getType(destinationFs, dstPath); + Type srcType = getType(sourceFs, srcPath); + LOG.debug("Source path: {}, type: {}", srcPath, srcType); + LOG.debug("Destination path: {}, type: {}", dstPath, dstType); + switch (dstType) { + case NONE: + // If destination is none, then no need to delete anything. + break; + case FILE: + // If destination is a file, + if (srcType == Type.FILE) { + // If source is a file, then delete the destination file if it is not same as source. + toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath); + LOG.debug("Same file?: {}", !toDelete); + } else { + // Otherwise, delete the destination file. + toDelete = true; + } + break; + case DIRECTORY: + // If destination is a directory, + if (srcType == Type.DIRECTORY) { + // If both are directories, visit for children of both. + Set<String> bothChildNames = Stream.concat( + Arrays.stream(sourceFs.listStatus(srcPath)) Review Comment: you are listing plenty of times: here and when calling getType, could this be optimized? Issue Time Tracking ------------------- Worklog Id: (was: 823024) Time Spent: 1h 20m (was: 1h 10m) > Make managed table copies handle updates (FileUtils) > ---------------------------------------------------- > > Key: HIVE-25790 > URL: https://issues.apache.org/jira/browse/HIVE-25790 > Project: Hive > Issue Type: Sub-task > Reporter: Haymant Mangla > Assignee: Teddy Choi > Priority: Major > Labels: pull-request-available > Time Spent: 1h 20m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.20.10#820010)