deniskuzZ commented on code in PR #3582:
URL: https://github.com/apache/hive/pull/3582#discussion_r1012643637
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leaves only identical files between source and destination. Takes the source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leaves only identical files between source and destination. Takes a single source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not the same as the source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
Review Comment:
is it possible that srcType and dstType would be different?
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leaves only identical files between source and destination. Takes the source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leaves only identical files between source and destination. Takes a single source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not the same as the source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
+ }
+ break;
+ case DIRECTORY:
+ // If destination is a directory,
+ if (srcType == Type.DIRECTORY) {
+ // If both are directories, visit for children of both.
+ Set<String> bothChildNames = Stream.concat(
Review Comment:
Why do you need concat? I think the idea was to remove target files that are not identical to the source. You should only compare the common set of children.
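
For illustration, a minimal sketch of that approach (the helper name is hypothetical; FileUtils.isSameFile is the method this PR adds). Iterating only the destination's children is enough, since a source-only name has nothing on the destination to delete, so the Stream.concat union adds no work:
````
// Hypothetical sketch, not the PR's code: walk only the destination children
// and drop entries that are absent on the source or differ from it.
private static void deleteNonIdenticalDstChildren(FileSystem sourceFs, Path srcPath,
    FileSystem destinationFs, Path dstPath) throws IOException {
  for (FileStatus dstChild : destinationFs.listStatus(dstPath)) {
    Path srcChild = new Path(srcPath, dstChild.getPath().getName());
    if (!sourceFs.exists(srcChild)
        || !FileUtils.isSameFile(sourceFs, srcChild, destinationFs, dstChild.getPath())) {
      destinationFs.delete(dstChild.getPath(), true);
    }
  }
}
````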
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leaves only identical files between source and destination. Takes the source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leaves only identical files between source and destination. Takes a single source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not the same as the source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
+ }
+ break;
+ case DIRECTORY:
+ // If destination is a directory,
+ if (srcType == Type.DIRECTORY) {
+ // If both are directories, visit for children of both.
+ Set<String> bothChildNames = Stream.concat(
+ Arrays.stream(sourceFs.listStatus(srcPath))
Review Comment:
You are listing the directories multiple times: here and again when calling getType.
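
A hedged sketch of one way to cut the listings (the helper is an assumption, not the PR's code): list each directory once and index the children by name, so the per-child type checks reuse these FileStatus objects instead of re-listing through getType:
````
// One listStatus call per directory; absence from the map then plays the
// role of Type.NONE, and status.isDirectory()/isFile() gives the type.
private static Map<String, FileStatus> childrenByName(FileSystem fs, Path dir)
    throws IOException {
  Map<String, FileStatus> children = new HashMap<>();
  for (FileStatus status : fs.listStatus(dir)) {
    children.put(status.getPath().getName(), status);
  }
  return children;
}
````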
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leaves only identical files between source and destination. Takes the source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leaves only identical files between source and destination. Takes a single source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
Review Comment:
formatting
##########
common/src/java/org/apache/hadoop/hive/common/FileUtils.java:
##########
@@ -773,12 +816,57 @@ public static boolean copy(FileSystem srcFS, FileStatus srcStatus, FileSystem ds
return deleteSource ? srcFS.delete(src, true) : true;
}
+ /**
+ * Checks if the source and destination are the same file. It follows the same logic as
+ * https://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html .
+ */
+ public static boolean isSameFile(FileSystem srcFS, Path src, FileSystem dstFS, Path dst)
+ throws IOException {
+ // When both file systems are HDFS, use strong check conditions:
+ // block size, length, checksum.
+ FileStatus srcStatus = srcFS.getFileStatus(src);
+ if (srcFS.getScheme().equals("hdfs") && dstFS.getScheme().equals("hdfs")) {
+ if (!dstFS.exists(dst)) {
+ return false;
+ }
+ FileStatus dstStatus = dstFS.getFileStatus(dst);
Review Comment:
Think about whether we can reduce FS listings and use a recursive iterator:
````
RemoteIterator<FileStatus> it = FileUtils.listStatusIterator(fs, path, FileUtils.HIDDEN_FILES_PATH_FILTER);
````
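
For example, a sketch of a single recursive walk (FileSystem.listFiles is standard Hadoop API; the method name is an assumption):
````
// One recursive listing; each LocatedFileStatus already carries length,
// type and modification time, so no extra getFileStatus calls are needed.
private static void walkDestinationOnce(FileSystem destinationFs, Path destinationPath)
    throws IOException {
  RemoteIterator<LocatedFileStatus> it = destinationFs.listFiles(destinationPath, true);
  while (it.hasNext()) {
    LocatedFileStatus dstStatus = it.next();
    // compare dstStatus against the matching source entry here
  }
}
````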
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]