diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java index 6838d4f775..b05ce10a96 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java @@ -122,6 +122,10 @@ private DistCpConstants() { /* DistCp CopyListing class override param */ public static final String CONF_LABEL_COPY_LISTING_CLASS = "distcp.copy.listing.class"; + /* Traverse directory from diff recursively and add paths to the copyList if true */ + public static final String CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY = + "distcp.diff.copy.listing.traverse.directory"; + /** * DistCp Filter class override param. */ diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java index c97f5c5277..465e7ed9fd 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java @@ -61,7 +61,7 @@ protected void validatePaths(DistCpContext context) * @param pathToListingFile The location at which the copy-listing file * is to be created. * @param context The distcp context with associated input options. - * @throws IOException + * @throws IOException if unable to construct the fileList */ @Override public void doBuildListing(Path pathToListingFile, DistCpContext context) diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java index 0ca9e632b9..f5f23eea05 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexCopyFilter.java @@ -50,6 +50,7 @@ public class RegexCopyFilter extends CopyFilter { /** * Constructor, sets up a File object to read filter patterns from and * the List to store the patterns. + * @param filtersFilename name of the filtersFile */ protected RegexCopyFilter(String filtersFilename) { filtersFile = new File(filtersFilename); diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java index c3f097da4b..3cb00e0a8d 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java @@ -234,7 +234,11 @@ private Path getPathWithSchemeAndAuthority(Path path) throws IOException { /** * Write a single file/directory to the sequence file. - * @throws IOException + * @param fileListWriter the list for holding processed results + * @param sourceRoot the source dir path for copyListing + * @param path add the given path to the file list. + * @param context The DistCp context with associated input options + * @throws IOException if it fails to add it to the fileList */ private void addToFileListing(SequenceFile.Writer fileListWriter, Path sourceRoot, Path path, DistCpContext context) throws IOException { @@ -265,7 +269,7 @@ private void addToFileListing(SequenceFile.Writer fileListWriter, * into the list. * @param fileListWriter the list for holding processed results * @param context The DistCp context with associated input options - * @throws IOException + * @throws IOException if unable to construct the fileList */ @VisibleForTesting protected void doBuildListingWithSnapshotDiff( @@ -274,6 +278,8 @@ protected void doBuildListingWithSnapshotDiff( ArrayList diffList = distCpSync.prepareDiffListForCopyListing(); Path sourceRoot = context.getSourcePaths().get(0); FileSystem sourceFS = sourceRoot.getFileSystem(getConf()); + boolean traverseDirectory = getConf().getBoolean( + DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, true); try { List fileStatuses = Lists.newArrayList(); @@ -285,25 +291,8 @@ protected void doBuildListingWithSnapshotDiff( addToFileListing(fileListWriter, sourceRoot, diff.getTarget(), context); } else if (diff.getType() == SnapshotDiffReport.DiffType.CREATE) { - addToFileListing(fileListWriter, - sourceRoot, diff.getTarget(), context); - - FileStatus sourceStatus = sourceFS.getFileStatus(diff.getTarget()); - if (sourceStatus.isDirectory()) { - LOG.debug("Adding source dir for traverse: {}", - sourceStatus.getPath()); - - HashSet excludeList = - distCpSync.getTraverseExcludeList(diff.getSource(), - context.getSourcePaths().get(0)); - - ArrayList sourceDirs = new ArrayList<>(); - sourceDirs.add(sourceStatus); - - new TraverseDirectory(fileListWriter, sourceFS, sourceDirs, - sourceRoot, context, excludeList, fileStatuses) - .traverseDirectory(); - } + addCreateDiffsToFileListing(fileListWriter, context, sourceRoot, + sourceFS, fileStatuses, diff, traverseDirectory); } } if (randomizeFileListing) { @@ -316,6 +305,43 @@ protected void doBuildListingWithSnapshotDiff( } } + /** + * Handle create Diffs and add to the copyList. + * If the path is a directory, iterate it recursively and add the paths + * to the result copyList. + * + * @param fileListWriter the list for holding processed results + * @param context The DistCp context with associated input options + * @param sourceRoot The rootDir of the source snapshot + * @param sourceFS the source Filesystem + * @param fileStatuses store the result fileStatuses to add to the copyList + * @param diff the SnapshotDiff report + * @param traverseDirectory traverse directory recursively and add paths to the copyList + * @throws IOException + */ + private void addCreateDiffsToFileListing(SequenceFile.Writer fileListWriter, + DistCpContext context, Path sourceRoot, FileSystem sourceFS, + List fileStatuses, DiffInfo diff, boolean traverseDirectory) throws IOException { + addToFileListing(fileListWriter, sourceRoot, diff.getTarget(), context); + + if (traverseDirectory) { + FileStatus sourceStatus = sourceFS.getFileStatus(diff.getTarget()); + if (sourceStatus.isDirectory()) { + LOG.debug("Adding source dir for traverse: {}", sourceStatus.getPath()); + + HashSet excludeList = + distCpSync.getTraverseExcludeList(diff.getSource(), + context.getSourcePaths().get(0)); + + ArrayList sourceDirs = new ArrayList<>(); + sourceDirs.add(sourceStatus); + + new TraverseDirectory(fileListWriter, sourceFS, sourceDirs, sourceRoot, + context, excludeList, fileStatuses).traverseDirectory(); + } + } + } + /** * Collect the list of * {@literal } @@ -332,7 +358,7 @@ protected void doBuildListingWithSnapshotDiff( * computed. * @param fileListWriter * @param context The distcp context with associated input options - * @throws IOException + * @throws IOException if unable to construct the fileList */ @VisibleForTesting protected void doBuildListing(SequenceFile.Writer fileListWriter, diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java index 69e1421f08..317bdcb149 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java @@ -20,6 +20,8 @@ import static org.mockito.Mockito.*; +import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -167,6 +169,72 @@ public void testDuplicates() { } } + @Test(expected = DuplicateFileException.class, timeout = 10000) + public void testDiffBasedSimpleCopyListing() throws IOException { + FileSystem fs = null; + Configuration configuration = getConf(); + DistCpSync distCpSync = Mockito.mock(DistCpSync.class); + Path listingFile = new Path("/tmp/list"); + // Throws DuplicateFileException as it recursively traverses src3 directory + // and also adds 3.txt,4.txt twice + configuration.setBoolean( + DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, true); + try { + fs = FileSystem.get(getConf()); + buildListingUsingSnapshotDiff(fs, configuration, distCpSync, listingFile); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test(timeout=10000) + public void testDiffBasedSimpleCopyListingWithoutTraverseDirectory() + throws IOException { + FileSystem fs = null; + Configuration configuration = getConf(); + DistCpSync distCpSync = Mockito.mock(DistCpSync.class); + Path listingFile = new Path("/tmp/list"); + // no exception expected in this case + configuration.setBoolean( + DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, false); + try { + fs = FileSystem.get(getConf()); + buildListingUsingSnapshotDiff(fs, configuration, distCpSync, listingFile); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + private void buildListingUsingSnapshotDiff(FileSystem fs, + Configuration configuration, DistCpSync distCpSync, Path listingFile) + throws IOException { + List srcPaths = new ArrayList<>(); + srcPaths.add(new Path("/tmp/in")); + TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/3.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src3/4.txt"); + Path target = new Path("/tmp/out"); + // adding below flags useDiff & sync only to enable context.shouldUseSnapshotDiff() + final DistCpOptions options = + new DistCpOptions.Builder(srcPaths, target).withUseDiff("snap1", + "snap2").withSyncFolder(true).build(); + // Create a dummy DiffInfo List that contains a directory + paths inside + // that directory as part of the diff. + ArrayList diffs = new ArrayList<>(); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/"), new Path("/tmp/in/src3/"), + SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/3.txt"), + new Path("/tmp/in/src3/3.txt"), SnapshotDiffReport.DiffType.CREATE)); + diffs.add(new DiffInfo(new Path("/tmp/in/src3/4.txt"), + new Path("/tmp/in/src3/4.txt"), SnapshotDiffReport.DiffType.CREATE)); + Mockito.when(distCpSync.prepareDiffListForCopyListing()).thenReturn(diffs); + final DistCpContext context = new DistCpContext(options); + CopyListing listing = + new SimpleCopyListing(configuration, CREDENTIALS, distCpSync); + listing.buildListing(listingFile, context); + } + @Test public void testDuplicateSourcePaths() throws Exception { FileSystem fs = FileSystem.get(getConf());