[ https://issues.apache.org/jira/browse/HDFS-17120?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17747164#comment-17747164 ]
ASF GitHub Bot commented on HDFS-17120: --------------------------------------- sadanand48 commented on code in PR #5885: URL: https://github.com/apache/hadoop/pull/5885#discussion_r1274102593 ########## hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java: ########## @@ -305,6 +305,33 @@ public static CopyListing getCopyListing(Configuration configuration, } } + /** + * Public Factory method with which the appropriate Diff CopyListing implementation may be retrieved. + * @param configuration The input configuration. + * @param credentials Credentials object on which the FS delegation tokens are cached + * @param distCpSync DistcpSync object used to sync diffs between source and target. + * @return An instance of the appropriate CopyListing implementation. + * @throws java.io.IOException - Exception if any + */ + public static CopyListing getDiffCopyListing(Configuration configuration, + Credentials credentials, DistCpSync distCpSync) throws IOException { + String copyListingClassName = + configuration.get(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS, + ""); + try { + Class<? extends CopyListing> copyListingClass = + configuration.getClass(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS, + SimpleCopyListing.class, SimpleCopyListing.class); + copyListingClassName = copyListingClass.getName(); + Constructor<? extends CopyListing> constructor = + copyListingClass.getDeclaredConstructor(Configuration.class, + Credentials.class, DistCpSync.class); + return constructor.newInstance(configuration, credentials,distCpSync); Review Comment: Done. ########## hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java: ########## @@ -167,6 +169,66 @@ public void testDuplicates() { } } + @Test(timeout=10000) + public void testFlatDiffCopyListing() { + FileSystem fs = null; + try { + fs = FileSystem.get(getConf()); + List<Path> srcPaths = new ArrayList<Path>(); + srcPaths.add(new Path("/tmp/in")); + TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/3.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/4.txt"); + Path target = new Path("/tmp/out"); + Path listingFile = new Path("/tmp/list"); + // adding below flags useDiff & sync only to enable context.shouldUseSnapshotDiff() + final DistCpOptions options = new DistCpOptions.Builder(srcPaths, target) + .withUseDiff("snap1","snap2") + .withSyncFolder(true) + .build(); + final DistCpContext context = new DistCpContext(options); + Configuration configuration = getConf(); + configuration.set(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS, + FlatDiffCopyListing.class.getName()); + DistCpSync distCpSync = Mockito.mock(DistCpSync.class); + // Create a dummy DiffInfo List that contains a directory + paths inside + // that directory as part of the diff. + + ArrayList<DiffInfo> diffs = new ArrayList<>(); + diffs.add( + new DiffInfo(new Path("/tmp/in/src3/"), new Path("/tmp/in/src3/"), + SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/3.txt"), + new Path("/tmp/in/src3/3.txt"), SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/4.txt"), + new Path("/tmp/in/src3/4.txt"), SnapshotDiffReport.DiffType.CREATE)); + Mockito.when(distCpSync.prepareDiffListForCopyListing()).thenReturn(diffs); + + CopyListing listing = + CopyListing.getDiffCopyListing(configuration, CREDENTIALS,distCpSync); + // won't throw DuplicateFileException as copyListing is FlatDiffCopyListing. + listing.buildListing(listingFile, context); + + // Throws DuplicateFileException when copyListing is SimpleCopyListing + // as it recursively traverses src3 directory and also adds 3.txt,4.txt twice + configuration.set(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS,SimpleCopyListing.class.getName()); + try{ + listing = + CopyListing.getDiffCopyListing(configuration, CREDENTIALS,distCpSync); Review Comment: Done. ########## hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java: ########## @@ -167,6 +169,66 @@ public void testDuplicates() { } } + @Test(timeout=10000) + public void testFlatDiffCopyListing() { + FileSystem fs = null; + try { + fs = FileSystem.get(getConf()); + List<Path> srcPaths = new ArrayList<Path>(); + srcPaths.add(new Path("/tmp/in")); + TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/3.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/4.txt"); + Path target = new Path("/tmp/out"); + Path listingFile = new Path("/tmp/list"); + // adding below flags useDiff & sync only to enable context.shouldUseSnapshotDiff() + final DistCpOptions options = new DistCpOptions.Builder(srcPaths, target) + .withUseDiff("snap1","snap2") + .withSyncFolder(true) + .build(); + final DistCpContext context = new DistCpContext(options); + Configuration configuration = getConf(); + configuration.set(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS, + FlatDiffCopyListing.class.getName()); + DistCpSync distCpSync = Mockito.mock(DistCpSync.class); + // Create a dummy DiffInfo List that contains a directory + paths inside + // that directory as part of the diff. + + ArrayList<DiffInfo> diffs = new ArrayList<>(); + diffs.add( + new DiffInfo(new Path("/tmp/in/src3/"), new Path("/tmp/in/src3/"), + SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/3.txt"), + new Path("/tmp/in/src3/3.txt"), SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/4.txt"), + new Path("/tmp/in/src3/4.txt"), SnapshotDiffReport.DiffType.CREATE)); + Mockito.when(distCpSync.prepareDiffListForCopyListing()).thenReturn(diffs); + + CopyListing listing = + CopyListing.getDiffCopyListing(configuration, CREDENTIALS,distCpSync); + // won't throw DuplicateFileException as copyListing is FlatDiffCopyListing. + listing.buildListing(listingFile, context); + + // Throws DuplicateFileException when copyListing is SimpleCopyListing + // as it recursively traverses src3 directory and also adds 3.txt,4.txt twice + configuration.set(DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_CLASS,SimpleCopyListing.class.getName()); + try{ + listing = + CopyListing.getDiffCopyListing(configuration, CREDENTIALS,distCpSync); + listing.buildListing(listingFile, context); Review Comment: Done. > Support snapshot diff based copylisting for flat paths. > ------------------------------------------------------- > > Key: HDFS-17120 > URL: https://issues.apache.org/jira/browse/HDFS-17120 > Project: Hadoop HDFS > Issue Type: Bug > Reporter: Sadanand Shenoy > Assignee: Sadanand Shenoy > Priority: Major > Labels: pull-request-available > > Currently for Diff-based copyListing that is used during the distcpSync step > of an incremental copy by default the SimpleCopyListing implementation is > used. In it's implementation it iterates through the DiffReport and if the > DiffType is Create and the path is a directory, it recursively traverses the > directory and adds the subpaths to the resultant copyList. > This works fine for implementations of snapshotDiff that include only > top-level directories as part of its DiffReport . Suppose a snapshotDiff > implementation outputs only flat paths that include both the directory and > sub-directory subpath in its DiffReport, it will lead to duplicate paths in > the copyList and throws DuplicateFileException. > > For example > Ozone filesystem implementation of snapdiff b/w 2 snapshots shows all > subpaths as part of the diff. > {code:java} > [~]# ozone sh snapshot create vol11/buck1 snap1 > [~]# ozone sh snapshot create vol11/buck2 snap1 > [~]# ozone fs -mkdir ofs://ozone1/vol11/buck1/dir1 > [ ~]# ozone fs -mkdir ofs://ozone1/vol11/buck1/dir1/dir11 > [ ~]# ozone fs -mkdir ofs://ozone1/vol11/buck1/dir1/dir11/dir111 > [~]# ozone sh snapshot create vol11/buck1 snap2 > [~]# ozone sh snapshot diff vol11/buck1 snap1 snap2 > Difference between snapshot: snap1 and snapshot: snap2 > + ./dir1 > + ./dir1/dir11 > + ./dir1/dir11/dir111 {code} > we can see even though dir11 & dir111 are subpaths they are present in > snapdiff , This is not the case for HDFS though. > This Jira aims to create a new copyListing impl that is used for diff based > copyListing that doesn't traverse the directory but only adds paths that are > present in its diff. -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org