HDFS-17120. Support snapshot diff based copylisting for flat paths. (#5885)

This commit is contained in:
Sadanand Shenoy 2023-07-27 13:23:57 +05:30 committed by GitHub
parent b1fc00d4b2
commit b971222372
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 122 additions and 23 deletions

View File

@ -122,6 +122,10 @@ public final class DistCpConstants {
/* DistCp CopyListing class override param */
public static final String CONF_LABEL_COPY_LISTING_CLASS = "distcp.copy.listing.class";
/* Traverse directory from diff recursively and add paths to the copyList if true */
public static final String CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY =
"distcp.diff.copy.listing.traverse.directory";
/**
* DistCp Filter class override param.
*/

View File

@ -61,7 +61,7 @@ public class GlobbedCopyListing extends CopyListing {
* @param pathToListingFile The location at which the copy-listing file
* is to be created.
* @param context The distcp context with associated input options.
* @throws IOException
* @throws IOException if unable to construct the fileList
*/
@Override
public void doBuildListing(Path pathToListingFile, DistCpContext context)

View File

@ -50,6 +50,7 @@ public class RegexCopyFilter extends CopyFilter {
/**
* Constructor, sets up a File object to read filter patterns from and
* the List to store the patterns.
* @param filtersFilename name of the filtersFile
*/
protected RegexCopyFilter(String filtersFilename) {
filtersFile = new File(filtersFilename);

View File

@ -234,7 +234,11 @@ public class SimpleCopyListing extends CopyListing {
/**
* Write a single file/directory to the sequence file.
* @throws IOException
* @param fileListWriter the list for holding processed results
* @param sourceRoot the source dir path for copyListing
* @param path add the given path to the file list.
* @param context The DistCp context with associated input options
* @throws IOException if it fails to add it to the fileList
*/
private void addToFileListing(SequenceFile.Writer fileListWriter,
Path sourceRoot, Path path, DistCpContext context) throws IOException {
@ -265,7 +269,7 @@ public class SimpleCopyListing extends CopyListing {
* into the list.
* @param fileListWriter the list for holding processed results
* @param context The DistCp context with associated input options
* @throws IOException
* @throws IOException if unable to construct the fileList
*/
@VisibleForTesting
protected void doBuildListingWithSnapshotDiff(
@ -274,6 +278,8 @@ public class SimpleCopyListing extends CopyListing {
ArrayList<DiffInfo> diffList = distCpSync.prepareDiffListForCopyListing();
Path sourceRoot = context.getSourcePaths().get(0);
FileSystem sourceFS = sourceRoot.getFileSystem(getConf());
boolean traverseDirectory = getConf().getBoolean(
DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, true);
try {
List<FileStatusInfo> fileStatuses = Lists.newArrayList();
@ -285,25 +291,8 @@ public class SimpleCopyListing extends CopyListing {
addToFileListing(fileListWriter,
sourceRoot, diff.getTarget(), context);
} else if (diff.getType() == SnapshotDiffReport.DiffType.CREATE) {
addToFileListing(fileListWriter,
sourceRoot, diff.getTarget(), context);
FileStatus sourceStatus = sourceFS.getFileStatus(diff.getTarget());
if (sourceStatus.isDirectory()) {
LOG.debug("Adding source dir for traverse: {}",
sourceStatus.getPath());
HashSet<String> excludeList =
distCpSync.getTraverseExcludeList(diff.getSource(),
context.getSourcePaths().get(0));
ArrayList<FileStatus> sourceDirs = new ArrayList<>();
sourceDirs.add(sourceStatus);
new TraverseDirectory(fileListWriter, sourceFS, sourceDirs,
sourceRoot, context, excludeList, fileStatuses)
.traverseDirectory();
}
addCreateDiffsToFileListing(fileListWriter, context, sourceRoot,
sourceFS, fileStatuses, diff, traverseDirectory);
}
}
if (randomizeFileListing) {
@ -316,6 +305,43 @@ public class SimpleCopyListing extends CopyListing {
}
}
/**
* Handle create Diffs and add to the copyList.
* If the path is a directory, iterate it recursively and add the paths
* to the result copyList.
*
* @param fileListWriter the list for holding processed results
* @param context The DistCp context with associated input options
* @param sourceRoot The rootDir of the source snapshot
* @param sourceFS the source Filesystem
* @param fileStatuses store the result fileStatuses to add to the copyList
* @param diff the SnapshotDiff report
* @param traverseDirectory traverse directory recursively and add paths to the copyList
* @throws IOException
*/
private void addCreateDiffsToFileListing(SequenceFile.Writer fileListWriter,
DistCpContext context, Path sourceRoot, FileSystem sourceFS,
List<FileStatusInfo> fileStatuses, DiffInfo diff, boolean traverseDirectory) throws IOException {
addToFileListing(fileListWriter, sourceRoot, diff.getTarget(), context);
if (traverseDirectory) {
FileStatus sourceStatus = sourceFS.getFileStatus(diff.getTarget());
if (sourceStatus.isDirectory()) {
LOG.debug("Adding source dir for traverse: {}", sourceStatus.getPath());
HashSet<String> excludeList =
distCpSync.getTraverseExcludeList(diff.getSource(),
context.getSourcePaths().get(0));
ArrayList<FileStatus> sourceDirs = new ArrayList<>();
sourceDirs.add(sourceStatus);
new TraverseDirectory(fileListWriter, sourceFS, sourceDirs, sourceRoot,
context, excludeList, fileStatuses).traverseDirectory();
}
}
}
/**
* Collect the list of
* {@literal <sourceRelativePath, sourceFileStatus>}
@ -332,7 +358,7 @@ public class SimpleCopyListing extends CopyListing {
* computed.
* @param fileListWriter
* @param context The distcp context with associated input options
* @throws IOException
* @throws IOException if unable to construct the fileList
*/
@VisibleForTesting
protected void doBuildListing(SequenceFile.Writer fileListWriter,

View File

@ -20,6 +20,8 @@ package org.apache.hadoop.tools;
import static org.mockito.Mockito.*;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.mockito.Mockito;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
@ -167,6 +169,72 @@ public class TestCopyListing extends SimpleCopyListing {
}
}
@Test(expected = DuplicateFileException.class, timeout = 10000)
public void testDiffBasedSimpleCopyListing() throws IOException {
FileSystem fs = null;
Configuration configuration = getConf();
DistCpSync distCpSync = Mockito.mock(DistCpSync.class);
Path listingFile = new Path("/tmp/list");
// Throws DuplicateFileException as it recursively traverses src3 directory
// and also adds 3.txt,4.txt twice
configuration.setBoolean(
DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, true);
try {
fs = FileSystem.get(getConf());
buildListingUsingSnapshotDiff(fs, configuration, distCpSync, listingFile);
} finally {
TestDistCpUtils.delete(fs, "/tmp");
}
}
@Test(timeout=10000)
public void testDiffBasedSimpleCopyListingWithoutTraverseDirectory()
throws IOException {
FileSystem fs = null;
Configuration configuration = getConf();
DistCpSync distCpSync = Mockito.mock(DistCpSync.class);
Path listingFile = new Path("/tmp/list");
// no exception expected in this case
configuration.setBoolean(
DistCpConstants.CONF_LABEL_DIFF_COPY_LISTING_TRAVERSE_DIRECTORY, false);
try {
fs = FileSystem.get(getConf());
buildListingUsingSnapshotDiff(fs, configuration, distCpSync, listingFile);
} finally {
TestDistCpUtils.delete(fs, "/tmp");
}
}
private void buildListingUsingSnapshotDiff(FileSystem fs,
Configuration configuration, DistCpSync distCpSync, Path listingFile)
throws IOException {
List<Path> srcPaths = new ArrayList<>();
srcPaths.add(new Path("/tmp/in"));
TestDistCpUtils.createFile(fs, "/tmp/in/src1/1.txt");
TestDistCpUtils.createFile(fs, "/tmp/in/src2/1.txt");
TestDistCpUtils.createFile(fs, "/tmp/in/src3/3.txt");
TestDistCpUtils.createFile(fs, "/tmp/in/src3/4.txt");
Path target = new Path("/tmp/out");
// adding below flags useDiff & sync only to enable context.shouldUseSnapshotDiff()
final DistCpOptions options =
new DistCpOptions.Builder(srcPaths, target).withUseDiff("snap1",
"snap2").withSyncFolder(true).build();
// Create a dummy DiffInfo List that contains a directory + paths inside
// that directory as part of the diff.
ArrayList<DiffInfo> diffs = new ArrayList<>();
diffs.add(new DiffInfo(new Path("/tmp/in/src3/"), new Path("/tmp/in/src3/"),
SnapshotDiffReport.DiffType.CREATE));
diffs.add(new DiffInfo(new Path("/tmp/in/src3/3.txt"),
new Path("/tmp/in/src3/3.txt"), SnapshotDiffReport.DiffType.CREATE));
diffs.add(new DiffInfo(new Path("/tmp/in/src3/4.txt"),
new Path("/tmp/in/src3/4.txt"), SnapshotDiffReport.DiffType.CREATE));
Mockito.when(distCpSync.prepareDiffListForCopyListing()).thenReturn(diffs);
final DistCpContext context = new DistCpContext(options);
CopyListing listing =
new SimpleCopyListing(configuration, CREDENTIALS, distCpSync);
listing.buildListing(listingFile, context);
}
@Test
public void testDuplicateSourcePaths() throws Exception {
FileSystem fs = FileSystem.get(getConf());