HDFS-8828. Utilize Snapshot diff report to build diff copy list in distcp. (Yufei Gu via Yongjun Zhang)

This commit is contained in:
Yongjun Zhang 2015-08-20 08:02:54 -07:00
parent 5e8fe89437
commit 0bc15cb6e6
9 changed files with 786 additions and 121 deletions

View File

@ -822,6 +822,9 @@ Release 2.8.0 - UNRELEASED
HDFS-8884. Fail-fast check in BlockPlacementPolicyDefault#chooseTarget. HDFS-8884. Fail-fast check in BlockPlacementPolicyDefault#chooseTarget.
(yliu) (yliu)
HDFS-8828. Utilize Snapshot diff report to build diff copy list in distcp.
(Yufei Gu via Yongjun Zhang)
OPTIMIZATIONS OPTIMIZATIONS
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than

View File

@ -27,6 +27,8 @@
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.tools.util.DistCpUtils; import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.Credentials;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
@ -46,7 +48,7 @@
public abstract class CopyListing extends Configured { public abstract class CopyListing extends Configured {
private Credentials credentials; private Credentials credentials;
static final Log LOG = LogFactory.getLog(DistCp.class);
/** /**
* Build listing function creates the input listing that distcp uses to * Build listing function creates the input listing that distcp uses to
* perform the copy. * perform the copy.
@ -89,6 +91,7 @@ public final void buildListing(Path pathToListFile,
config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths()); config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());
validateFinalListing(pathToListFile, options); validateFinalListing(pathToListFile, options);
LOG.info("Number of paths in the copy list: " + this.getNumberOfPaths());
} }
/** /**
@ -153,6 +156,7 @@ private void validateFinalListing(Path pathToListFile, DistCpOptions options)
Text currentKey = new Text(); Text currentKey = new Text();
Set<URI> aclSupportCheckFsSet = Sets.newHashSet(); Set<URI> aclSupportCheckFsSet = Sets.newHashSet();
Set<URI> xAttrSupportCheckFsSet = Sets.newHashSet(); Set<URI> xAttrSupportCheckFsSet = Sets.newHashSet();
long idx = 0;
while (reader.next(currentKey)) { while (reader.next(currentKey)) {
if (currentKey.equals(lastKey)) { if (currentKey.equals(lastKey)) {
CopyListingFileStatus currentFileStatus = new CopyListingFileStatus(); CopyListingFileStatus currentFileStatus = new CopyListingFileStatus();
@ -178,6 +182,12 @@ private void validateFinalListing(Path pathToListFile, DistCpOptions options)
} }
} }
lastKey.set(currentKey); lastKey.set(currentKey);
if (options.shouldUseDiff() && LOG.isDebugEnabled()) {
LOG.debug("Copy list entry " + idx + ": " +
lastFileStatus.getPath().toUri().getPath());
idx++;
}
} }
} finally { } finally {
IOUtils.closeStream(reader); IOUtils.closeStream(reader);
@ -224,9 +234,6 @@ public static CopyListing getCopyListing(Configuration configuration,
Credentials credentials, Credentials credentials,
DistCpOptions options) DistCpOptions options)
throws IOException { throws IOException {
if (options.shouldUseDiff()) {
return new GlobbedCopyListing(configuration, credentials);
}
String copyListingClassName = configuration.get(DistCpConstants. String copyListingClassName = configuration.get(DistCpConstants.
CONF_LABEL_COPY_LISTING_CLASS, ""); CONF_LABEL_COPY_LISTING_CLASS, "");
Class<? extends CopyListing> copyListingClass; Class<? extends CopyListing> copyListingClass;

View File

@ -17,12 +17,9 @@
*/ */
package org.apache.hadoop.tools; package org.apache.hadoop.tools;
import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.List;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
/** /**
@ -54,12 +51,19 @@ public int compare(DiffInfo d1, DiffInfo d2) {
*/ */
private Path tmp; private Path tmp;
/** The target file/dir of the rename op. Null means the op is deletion. */ /** The target file/dir of the rename op. Null means the op is deletion. */
final Path target; Path target;
DiffInfo(Path source, Path target) { private final SnapshotDiffReport.DiffType type;
public SnapshotDiffReport.DiffType getType(){
return this.type;
}
DiffInfo(Path source, Path target, SnapshotDiffReport.DiffType type) {
assert source != null; assert source != null;
this.source = source; this.source = source;
this.target= target; this.target= target;
this.type = type;
} }
void setTmp(Path tmp) { void setTmp(Path tmp) {
@ -69,22 +73,4 @@ void setTmp(Path tmp) {
Path getTmp() { Path getTmp() {
return tmp; return tmp;
} }
static DiffInfo[] getDiffs(SnapshotDiffReport report, Path targetDir) {
List<DiffInfo> diffs = new ArrayList<>();
for (SnapshotDiffReport.DiffReportEntry entry : report.getDiffList()) {
if (entry.getType() == SnapshotDiffReport.DiffType.DELETE) {
final Path source = new Path(targetDir,
DFSUtil.bytes2String(entry.getSourcePath()));
diffs.add(new DiffInfo(source, null));
} else if (entry.getType() == SnapshotDiffReport.DiffType.RENAME) {
final Path source = new Path(targetDir,
DFSUtil.bytes2String(entry.getSourcePath()));
final Path target = new Path(targetDir,
DFSUtil.bytes2String(entry.getTargetPath()));
diffs.add(new DiffInfo(source, target));
}
}
return diffs.toArray(new DiffInfo[diffs.size()]);
}
} }

View File

@ -175,11 +175,18 @@ public Job createAndSubmitJob() throws Exception {
job = createJob(); job = createJob();
} }
if (inputOptions.shouldUseDiff()) { if (inputOptions.shouldUseDiff()) {
if (!DistCpSync.sync(inputOptions, getConf())) { DistCpSync distCpSync = new DistCpSync(inputOptions, getConf());
if (distCpSync.sync()) {
createInputFileListingWithDiff(job, distCpSync);
} else {
inputOptions.disableUsingDiff(); inputOptions.disableUsingDiff();
} }
} }
createInputFileListing(job);
// Fallback to default DistCp if without "diff" option or sync failed.
if (!inputOptions.shouldUseDiff()) {
createInputFileListing(job);
}
job.submit(); job.submit();
submitted = true; submitted = true;
@ -384,6 +391,22 @@ protected Path createInputFileListing(Job job) throws IOException {
return fileListingPath; return fileListingPath;
} }
/**
* Create input listing based on snapshot diff report.
* @param job - Handle to job
* @param distCpSync the class wraps the snapshot diff report
* @return Returns the path where the copy listing is created
* @throws IOException - If any
*/
private Path createInputFileListingWithDiff(Job job, DistCpSync distCpSync)
throws IOException {
Path fileListingPath = getFileListingPath();
CopyListing copyListing = new SimpleCopyListing(job.getConfiguration(),
job.getCredentials(), distCpSync);
copyListing.buildListing(fileListingPath, inputOptions);
return fileListingPath;
}
/** /**
* Get default name of the copy listing file. Use the meta folder * Get default name of the copy listing file. Use the meta folder
* to create the copy listing file * to create the copy listing file

View File

@ -614,9 +614,9 @@ public void validate(DistCpOptionSwitch option, boolean value) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"Append is disallowed when skipping CRC"); "Append is disallowed when skipping CRC");
} }
if ((!syncFolder || !deleteMissing) && useDiff) { if ((!syncFolder || deleteMissing) && useDiff) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"Diff is valid only with update and delete options"); "Diff is valid only with update options");
} }
} }

View File

@ -17,10 +17,10 @@
*/ */
package org.apache.hadoop.tools; package org.apache.hadoop.tools;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
@ -29,6 +29,9 @@
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.EnumMap;
import java.util.ArrayList;
import java.util.HashSet;
/** /**
* This class provides the basic functionality to sync two FileSystems based on * This class provides the basic functionality to sync two FileSystems based on
@ -41,9 +44,26 @@
* source.s1 * source.s1
*/ */
class DistCpSync { class DistCpSync {
private DistCpOptions inputOptions;
private Configuration conf;
private EnumMap<SnapshotDiffReport.DiffType, List<DiffInfo>> diffMap;
private DiffInfo[] renameDiffs;
static boolean sync(DistCpOptions inputOptions, Configuration conf) DistCpSync(DistCpOptions options, Configuration conf) {
throws IOException { this.inputOptions = options;
this.conf = conf;
}
/**
* Check if three conditions are met before sync.
* 1. Only one source directory.
* 2. Both source and target file system are DFS.
* 3. There is no change between from and the current status in target
* file system.
* Throw exceptions if first two aren't met, and return false to fallback to
* default distcp if the third condition isn't met.
*/
private boolean preSyncCheck() throws IOException {
List<Path> sourcePaths = inputOptions.getSourcePaths(); List<Path> sourcePaths = inputOptions.getSourcePaths();
if (sourcePaths.size() != 1) { if (sourcePaths.size() != 1) {
// we only support one source dir which must be a snapshottable directory // we only support one source dir which must be a snapshottable directory
@ -62,26 +82,41 @@ static boolean sync(DistCpOptions inputOptions, Configuration conf)
throw new IllegalArgumentException("The FileSystems needs to" + throw new IllegalArgumentException("The FileSystems needs to" +
" be DistributedFileSystem for using snapshot-diff-based distcp"); " be DistributedFileSystem for using snapshot-diff-based distcp");
} }
final DistributedFileSystem sourceFs = (DistributedFileSystem) sfs; final DistributedFileSystem targetFs = (DistributedFileSystem) tfs;
final DistributedFileSystem targetFs= (DistributedFileSystem) tfs;
// make sure targetFS has no change between from and the current states // make sure targetFS has no change between from and the current states
if (!checkNoChange(inputOptions, targetFs, targetDir)) { if (!checkNoChange(targetFs, targetDir)) {
// set the source path using the snapshot path // set the source path using the snapshot path
inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir, inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir,
inputOptions.getToSnapshot()))); inputOptions.getToSnapshot())));
return false; return false;
} }
return true;
}
public boolean sync() throws IOException {
if (!preSyncCheck()) {
return false;
}
if (!getAllDiffs()) {
return false;
}
List<Path> sourcePaths = inputOptions.getSourcePaths();
final Path sourceDir = sourcePaths.get(0);
final Path targetDir = inputOptions.getTargetPath();
final FileSystem tfs = targetDir.getFileSystem(conf);
final DistributedFileSystem targetFs = (DistributedFileSystem) tfs;
Path tmpDir = null; Path tmpDir = null;
try { try {
tmpDir = createTargetTmpDir(targetFs, targetDir); tmpDir = createTargetTmpDir(targetFs, targetDir);
DiffInfo[] diffs = getDiffs(inputOptions, sourceFs, sourceDir, targetDir); DiffInfo[] renameAndDeleteDiffs = getRenameAndDeleteDiffs(targetDir);
if (diffs == null) { if (renameAndDeleteDiffs.length > 0) {
return false; // do the real sync work: deletion and rename
syncDiff(renameAndDeleteDiffs, targetFs, tmpDir);
} }
// do the real sync work: deletion and rename
syncDiff(diffs, targetFs, tmpDir);
return true; return true;
} catch (Exception e) { } catch (Exception e) {
DistCp.LOG.warn("Failed to use snapshot diff for distcp", e); DistCp.LOG.warn("Failed to use snapshot diff for distcp", e);
@ -95,11 +130,64 @@ static boolean sync(DistCpOptions inputOptions, Configuration conf)
} }
} }
private static String getSnapshotName(String name) { /**
* Get all diffs from source directory snapshot diff report, put them into an
* EnumMap whose key is DiffType, and value is a DiffInfo list. If there is
* no entry for a given DiffType, the associated value will be an empty list.
*/
private boolean getAllDiffs() throws IOException {
List<Path> sourcePaths = inputOptions.getSourcePaths();
final Path sourceDir = sourcePaths.get(0);
try {
DistributedFileSystem fs =
(DistributedFileSystem) sourceDir.getFileSystem(conf);
final String from = getSnapshotName(inputOptions.getFromSnapshot());
final String to = getSnapshotName(inputOptions.getToSnapshot());
SnapshotDiffReport report = fs.getSnapshotDiffReport(sourceDir,
from, to);
this.diffMap = new EnumMap<>(SnapshotDiffReport.DiffType.class);
for (SnapshotDiffReport.DiffType type :
SnapshotDiffReport.DiffType.values()) {
diffMap.put(type, new ArrayList<DiffInfo>());
}
for (SnapshotDiffReport.DiffReportEntry entry : report.getDiffList()) {
// If the entry is the snapshot root, usually a item like "M\t."
// in the diff report. We don't need to handle it and cannot handle it,
// since its sourcepath is empty.
if (entry.getSourcePath().length <= 0) {
continue;
}
List<DiffInfo> list = diffMap.get(entry.getType());
if (entry.getType() == SnapshotDiffReport.DiffType.MODIFY ||
entry.getType() == SnapshotDiffReport.DiffType.CREATE ||
entry.getType() == SnapshotDiffReport.DiffType.DELETE) {
final Path source =
new Path(DFSUtil.bytes2String(entry.getSourcePath()));
list.add(new DiffInfo(source, null, entry.getType()));
} else if (entry.getType() == SnapshotDiffReport.DiffType.RENAME) {
final Path source =
new Path(DFSUtil.bytes2String(entry.getSourcePath()));
final Path target =
new Path(DFSUtil.bytes2String(entry.getTargetPath()));
list.add(new DiffInfo(source, target, entry.getType()));
}
}
return true;
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e);
}
this.diffMap = null;
return false;
}
private String getSnapshotName(String name) {
return Path.CUR_DIR.equals(name) ? "" : name; return Path.CUR_DIR.equals(name) ? "" : name;
} }
private static Path getSourceSnapshotPath(Path sourceDir, String snapshotName) { private Path getSourceSnapshotPath(Path sourceDir, String snapshotName) {
if (Path.CUR_DIR.equals(snapshotName)) { if (Path.CUR_DIR.equals(snapshotName)) {
return sourceDir; return sourceDir;
} else { } else {
@ -108,8 +196,8 @@ private static Path getSourceSnapshotPath(Path sourceDir, String snapshotName) {
} }
} }
private static Path createTargetTmpDir(DistributedFileSystem targetFs, private Path createTargetTmpDir(DistributedFileSystem targetFs,
Path targetDir) throws IOException { Path targetDir) throws IOException {
final Path tmp = new Path(targetDir, final Path tmp = new Path(targetDir,
DistCpConstants.HDFS_DISTCP_DIFF_DIRECTORY_NAME + DistCp.rand.nextInt()); DistCpConstants.HDFS_DISTCP_DIFF_DIRECTORY_NAME + DistCp.rand.nextInt());
if (!targetFs.mkdirs(tmp)) { if (!targetFs.mkdirs(tmp)) {
@ -118,8 +206,8 @@ private static Path createTargetTmpDir(DistributedFileSystem targetFs,
return tmp; return tmp;
} }
private static void deleteTargetTmpDir(DistributedFileSystem targetFs, private void deleteTargetTmpDir(DistributedFileSystem targetFs,
Path tmpDir) { Path tmpDir) {
try { try {
if (tmpDir != null) { if (tmpDir != null) {
targetFs.delete(tmpDir, true); targetFs.delete(tmpDir, true);
@ -133,8 +221,7 @@ private static void deleteTargetTmpDir(DistributedFileSystem targetFs,
* Compute the snapshot diff on the given file system. Return true if the diff * Compute the snapshot diff on the given file system. Return true if the diff
* is empty, i.e., no changes have happened in the FS. * is empty, i.e., no changes have happened in the FS.
*/ */
private static boolean checkNoChange(DistCpOptions inputOptions, private boolean checkNoChange(DistributedFileSystem fs, Path path) {
DistributedFileSystem fs, Path path) {
try { try {
SnapshotDiffReport targetDiff = SnapshotDiffReport targetDiff =
fs.getSnapshotDiffReport(path, inputOptions.getFromSnapshot(), ""); fs.getSnapshotDiffReport(path, inputOptions.getFromSnapshot(), "");
@ -151,22 +238,7 @@ private static boolean checkNoChange(DistCpOptions inputOptions,
return false; return false;
} }
@VisibleForTesting private void syncDiff(DiffInfo[] diffs,
static DiffInfo[] getDiffs(DistCpOptions inputOptions,
DistributedFileSystem fs, Path sourceDir, Path targetDir) {
try {
final String from = getSnapshotName(inputOptions.getFromSnapshot());
final String to = getSnapshotName(inputOptions.getToSnapshot());
SnapshotDiffReport sourceDiff = fs.getSnapshotDiffReport(sourceDir,
from, to);
return DiffInfo.getDiffs(sourceDiff, targetDir);
} catch (IOException e) {
DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e);
}
return null;
}
private static void syncDiff(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException { DistributedFileSystem targetFs, Path tmpDir) throws IOException {
moveToTmpDir(diffs, targetFs, tmpDir); moveToTmpDir(diffs, targetFs, tmpDir);
moveToTarget(diffs, targetFs); moveToTarget(diffs, targetFs);
@ -176,7 +248,7 @@ private static void syncDiff(DiffInfo[] diffs,
* Move all the source files that should be renamed or deleted to the tmp * Move all the source files that should be renamed or deleted to the tmp
* directory. * directory.
*/ */
private static void moveToTmpDir(DiffInfo[] diffs, private void moveToTmpDir(DiffInfo[] diffs,
DistributedFileSystem targetFs, Path tmpDir) throws IOException { DistributedFileSystem targetFs, Path tmpDir) throws IOException {
// sort the diffs based on their source paths to make sure the files and // sort the diffs based on their source paths to make sure the files and
// subdirs are moved before moving their parents/ancestors. // subdirs are moved before moving their parents/ancestors.
@ -196,7 +268,7 @@ private static void moveToTmpDir(DiffInfo[] diffs,
* Finish the rename operations: move all the intermediate files/directories * Finish the rename operations: move all the intermediate files/directories
* from the tmp dir to the final targets. * from the tmp dir to the final targets.
*/ */
private static void moveToTarget(DiffInfo[] diffs, private void moveToTarget(DiffInfo[] diffs,
DistributedFileSystem targetFs) throws IOException { DistributedFileSystem targetFs) throws IOException {
// sort the diffs based on their target paths to make sure the parent // sort the diffs based on their target paths to make sure the parent
// directories are created first. // directories are created first.
@ -210,4 +282,166 @@ private static void moveToTarget(DiffInfo[] diffs,
} }
} }
} }
/**
* Get rename and delete diffs and add the targetDir as the prefix of their
* source and target paths.
*/
private DiffInfo[] getRenameAndDeleteDiffs(Path targetDir) {
List<DiffInfo> renameAndDeleteDiff = new ArrayList<>();
for (DiffInfo diff : diffMap.get(SnapshotDiffReport.DiffType.DELETE)) {
Path source = new Path(targetDir, diff.source);
renameAndDeleteDiff.add(new DiffInfo(source, diff.target,
diff.getType()));
}
for (DiffInfo diff : diffMap.get(SnapshotDiffReport.DiffType.RENAME)) {
Path source = new Path(targetDir, diff.source);
Path target = new Path(targetDir, diff.target);
renameAndDeleteDiff.add(new DiffInfo(source, target, diff.getType()));
}
return renameAndDeleteDiff.toArray(
new DiffInfo[renameAndDeleteDiff.size()]);
}
private DiffInfo[] getCreateAndModifyDiffs() {
List<DiffInfo> createDiff =
diffMap.get(SnapshotDiffReport.DiffType.CREATE);
List<DiffInfo> modifyDiff =
diffMap.get(SnapshotDiffReport.DiffType.MODIFY);
List<DiffInfo> diffs =
new ArrayList<>(createDiff.size() + modifyDiff.size());
diffs.addAll(createDiff);
diffs.addAll(modifyDiff);
return diffs.toArray(new DiffInfo[diffs.size()]);
}
/**
* Probe for a path being a parent of another.
* @return true if the parent's path matches the start of the child's
*/
private boolean isParentOf(Path parent, Path child) {
String parentPath = parent.toString();
String childPath = child.toString();
if (!parentPath.endsWith(Path.SEPARATOR)) {
parentPath += Path.SEPARATOR;
}
return childPath.length() > parentPath.length() &&
childPath.startsWith(parentPath);
}
/**
* Find the possible rename item which equals to the parent or self of
* a created/modified file/directory.
* @param diff a modify/create diff item
* @param renameDiffArray all rename diffs
* @return possible rename item
*/
private DiffInfo getRenameItem(DiffInfo diff, DiffInfo[] renameDiffArray) {
for (DiffInfo renameItem : renameDiffArray) {
if (diff.source.equals(renameItem.source)) {
// The same path string may appear in:
// 1. both renamed and modified snapshot diff entries.
// 2. both renamed and created snapshot diff entries.
// Case 1 is the about same file/directory, whereas case 2
// is about two different files/directories.
// We are finding case 1 here, thus we check against DiffType.MODIFY.
if (diff.getType() == SnapshotDiffReport.DiffType.MODIFY) {
return renameItem;
}
} else if (isParentOf(renameItem.source, diff.source)) {
// If rename entry is the parent of diff entry, then both MODIFY and
// CREATE diff entries should be handled.
return renameItem;
}
}
return null;
}
/**
* For a given source path, get its target path based on the rename item.
* @return target path
*/
private Path getTargetPath(Path sourcePath, DiffInfo renameItem) {
if (sourcePath.equals(renameItem.source)) {
return renameItem.target;
}
StringBuffer sb = new StringBuffer(sourcePath.toString());
String remain = sb.substring(renameItem.source.toString().length() + 1);
return new Path(renameItem.target, remain);
}
/**
* Prepare the diff list.
* This diff list only includes created or modified files/directories, since
* delete and rename items are synchronized already.
*
* If the parent or self of a source path is renamed, we need to change its
* target path according the correspondent rename item.
* @return a diff list
*/
public ArrayList<DiffInfo> prepareDiffList() {
DiffInfo[] modifyAndCreateDiffs = getCreateAndModifyDiffs();
List<DiffInfo> renameDiffsList =
diffMap.get(SnapshotDiffReport.DiffType.RENAME);
DiffInfo[] renameDiffArray =
renameDiffsList.toArray(new DiffInfo[renameDiffsList.size()]);
Arrays.sort(renameDiffArray, DiffInfo.sourceComparator);
ArrayList<DiffInfo> finalListWithTarget = new ArrayList<>();
for (DiffInfo diff : modifyAndCreateDiffs) {
DiffInfo renameItem = getRenameItem(diff, renameDiffArray);
if (renameItem == null) {
diff.target = diff.source;
} else {
diff.target = getTargetPath(diff.source, renameItem);
}
finalListWithTarget.add(diff);
}
return finalListWithTarget;
}
/**
* This method returns a list of items to be excluded when recursively
* traversing newDir to build the copy list.
*
* Specifically, given a newly created directory newDir (a CREATE entry in
* the snapshot diff), if a previously copied file/directory itemX is moved
* (a RENAME entry in the snapshot diff) into newDir, itemX should be
* excluded when recursively traversing newDir in caller method so that it
* will not to be copied again.
* If the same itemX also has a MODIFY entry in the snapshot diff report,
* meaning it was modified after it was previously copied, it will still
* be added to the copy list in caller method.
* @return the exclude list
*/
public HashSet<String> getTraverseExcludeList(Path newDir, Path prefix) {
if (renameDiffs == null) {
List<DiffInfo> renameList =
diffMap.get(SnapshotDiffReport.DiffType.RENAME);
renameDiffs = renameList.toArray(new DiffInfo[renameList.size()]);
Arrays.sort(renameDiffs, DiffInfo.targetComparator);
}
if (renameDiffs.length <= 0) {
return null;
}
boolean foundChild = false;
HashSet<String> excludeList = new HashSet<>();
for (DiffInfo diff : renameDiffs) {
if (isParentOf(newDir, diff.target)) {
foundChild = true;
excludeList.add(new Path(prefix, diff.target).toUri().getPath());
} else if (foundChild) {
// The renameDiffs was sorted, the matching section should be
// contiguous.
break;
}
}
return excludeList;
}
} }

View File

@ -24,6 +24,7 @@
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
@ -40,6 +41,7 @@
import java.io.*; import java.io.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import static org.apache.hadoop.tools.DistCpConstants import static org.apache.hadoop.tools.DistCpConstants
.HDFS_RESERVED_RAW_DIRECTORY_NAME; .HDFS_RESERVED_RAW_DIRECTORY_NAME;
@ -59,6 +61,7 @@ public class SimpleCopyListing extends CopyListing {
private int numListstatusThreads = 1; private int numListstatusThreads = 1;
private final int maxRetries = 3; private final int maxRetries = 3;
private CopyFilter copyFilter; private CopyFilter copyFilter;
private DistCpSync distCpSync;
/** /**
* Protected constructor, to initialize configuration. * Protected constructor, to initialize configuration.
@ -77,12 +80,20 @@ protected SimpleCopyListing(Configuration configuration, Credentials credentials
} }
@VisibleForTesting @VisibleForTesting
protected SimpleCopyListing(Configuration configuration, Credentials credentials, protected SimpleCopyListing(Configuration configuration,
Credentials credentials,
int numListstatusThreads) { int numListstatusThreads) {
super(configuration, credentials); super(configuration, credentials);
this.numListstatusThreads = numListstatusThreads; this.numListstatusThreads = numListstatusThreads;
} }
protected SimpleCopyListing(Configuration configuration,
Credentials credentials,
DistCpSync distCpSync) {
this(configuration, credentials);
this.distCpSync = distCpSync;
}
@Override @Override
protected void validatePaths(DistCpOptions options) protected void validatePaths(DistCpOptions options)
throws IOException, InvalidInputException { throws IOException, InvalidInputException {
@ -157,8 +168,106 @@ protected void validatePaths(DistCpOptions options)
/** {@inheritDoc} */ /** {@inheritDoc} */
@Override @Override
public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException { public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
doBuildListing(getWriter(pathToListingFile), options); if(options.shouldUseDiff()) {
doBuildListingWithSnapshotDiff(getWriter(pathToListingFile), options);
}else {
doBuildListing(getWriter(pathToListingFile), options);
}
} }
/**
* Get a path with its scheme and authority.
*/
private Path getPathWithSchemeAndAuthority(Path path) throws IOException {
FileSystem fs= path.getFileSystem(getConf());
String scheme = path.toUri().getScheme();
if (scheme == null) {
scheme = fs.getUri().getScheme();
}
String authority = path.toUri().getAuthority();
if (authority == null) {
authority = fs.getUri().getAuthority();
}
return new Path(scheme, authority, path.toUri().getPath());
}
/**
* Write a single file/directory to the sequence file.
* @throws IOException
*/
private void addToFileListing(SequenceFile.Writer fileListWriter,
Path sourceRoot, Path path, DistCpOptions options) throws IOException {
sourceRoot = getPathWithSchemeAndAuthority(sourceRoot);
path = getPathWithSchemeAndAuthority(path);
path = makeQualified(path);
FileSystem sourceFS = sourceRoot.getFileSystem(getConf());
FileStatus fileStatus = sourceFS.getFileStatus(path);
final boolean preserveAcls = options.shouldPreserve(FileAttribute.ACL);
final boolean preserveXAttrs = options.shouldPreserve(FileAttribute.XATTR);
final boolean preserveRawXAttrs = options.shouldPreserveRawXattrs();
CopyListingFileStatus fileCopyListingStatus =
DistCpUtils.toCopyListingFileStatus(sourceFS, fileStatus,
preserveAcls, preserveXAttrs, preserveRawXAttrs);
writeToFileListingRoot(fileListWriter, fileCopyListingStatus,
sourceRoot, options);
}
/**
* Build a copy list based on the snapshot diff report.
*
* Any file/directory changed or created will be in the list. Deleted
* files/directories will not be in the list, since they are handled by
* {@link org.apache.hadoop.tools.DistCpSync#sync}. An item can be
* created/modified and renamed, in which case, the target path is put
* into the list.
* @throws IOException
*/
@VisibleForTesting
public void doBuildListingWithSnapshotDiff(SequenceFile.Writer fileListWriter,
DistCpOptions options) throws IOException {
ArrayList<DiffInfo> diffList = distCpSync.prepareDiffList();
Path sourceRoot = options.getSourcePaths().get(0);
FileSystem sourceFS = sourceRoot.getFileSystem(getConf());
try {
for (DiffInfo diff : diffList) {
// add snapshot paths prefix
diff.target = new Path(options.getSourcePaths().get(0), diff.target);
if (diff.getType() == SnapshotDiffReport.DiffType.MODIFY) {
addToFileListing(fileListWriter, sourceRoot, diff.target, options);
} else if (diff.getType() == SnapshotDiffReport.DiffType.CREATE) {
addToFileListing(fileListWriter, sourceRoot, diff.target, options);
FileStatus sourceStatus = sourceFS.getFileStatus(diff.target);
if (sourceStatus.isDirectory()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Adding source dir for traverse: " +
sourceStatus.getPath());
}
HashSet<String> excludeList =
distCpSync.getTraverseExcludeList(diff.source,
options.getSourcePaths().get(0));
ArrayList<FileStatus> sourceDirs = new ArrayList<>();
sourceDirs.add(sourceStatus);
traverseDirectory(fileListWriter, sourceFS, sourceDirs,
sourceRoot, options, excludeList);
}
}
}
fileListWriter.close();
fileListWriter = null;
} finally {
IOUtils.cleanup(LOG, fileListWriter);
}
}
/** /**
* Collect the list of * Collect the list of
* {@literal <sourceRelativePath, sourceFileStatus>} * {@literal <sourceRelativePath, sourceFileStatus>}
@ -226,7 +335,7 @@ public void doBuildListing(SequenceFile.Writer fileListWriter,
} }
} }
traverseDirectory(fileListWriter, sourceFS, sourceDirs, traverseDirectory(fileListWriter, sourceFS, sourceDirs,
sourcePathRoot, options); sourcePathRoot, options, null);
} }
} }
fileListWriter.close(); fileListWriter.close();
@ -312,9 +421,33 @@ private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
private static class FileStatusProcessor private static class FileStatusProcessor
implements WorkRequestProcessor<FileStatus, FileStatus[]> { implements WorkRequestProcessor<FileStatus, FileStatus[]> {
private FileSystem fileSystem; private FileSystem fileSystem;
private HashSet<String> excludeList;
public FileStatusProcessor(FileSystem fileSystem) { public FileStatusProcessor(FileSystem fileSystem,
HashSet<String> excludeList) {
this.fileSystem = fileSystem; this.fileSystem = fileSystem;
this.excludeList = excludeList;
}
/**
* Get FileStatuses for a given path.
* Exclude the some renamed FileStatuses since they are already handled by
* {@link org.apache.hadoop.tools.DistCpSync#sync}.
* @return an array of file status
*/
private FileStatus[] getFileStatus(Path path) throws IOException {
FileStatus[] fileStatuses = fileSystem.listStatus(path);
if (excludeList != null && excludeList.size() > 0) {
ArrayList<FileStatus> fileStatusList = new ArrayList<>();
for(FileStatus status : fileStatuses) {
if (!excludeList.contains(status.getPath().toUri().getPath())) {
fileStatusList.add(status);
}
}
fileStatuses = fileStatusList.toArray(
new FileStatus[fileStatusList.size()]);
}
return fileStatuses;
} }
/* /*
@ -344,8 +477,8 @@ public WorkReport<FileStatus[]> processItem(
LOG.debug("Interrupted while sleeping in exponential backoff."); LOG.debug("Interrupted while sleeping in exponential backoff.");
} }
} }
result = new WorkReport<FileStatus[]>( result = new WorkReport<FileStatus[]>(getFileStatus(parent.getPath()),
fileSystem.listStatus(parent.getPath()), retry, true); retry, true);
} catch (FileNotFoundException fnf) { } catch (FileNotFoundException fnf) {
LOG.error("FileNotFoundException exception in listStatus: " + LOG.error("FileNotFoundException exception in listStatus: " +
fnf.getMessage()); fnf.getMessage());
@ -376,7 +509,8 @@ private void traverseDirectory(SequenceFile.Writer fileListWriter,
FileSystem sourceFS, FileSystem sourceFS,
ArrayList<FileStatus> sourceDirs, ArrayList<FileStatus> sourceDirs,
Path sourcePathRoot, Path sourcePathRoot,
DistCpOptions options) DistCpOptions options,
HashSet<String> excludeList)
throws IOException { throws IOException {
final boolean preserveAcls = options.shouldPreserve(FileAttribute.ACL); final boolean preserveAcls = options.shouldPreserve(FileAttribute.ACL);
final boolean preserveXAttrs = options.shouldPreserve(FileAttribute.XATTR); final boolean preserveXAttrs = options.shouldPreserve(FileAttribute.XATTR);
@ -389,7 +523,8 @@ private void traverseDirectory(SequenceFile.Writer fileListWriter,
new ProducerConsumer<FileStatus, FileStatus[]>(numListstatusThreads); new ProducerConsumer<FileStatus, FileStatus[]>(numListstatusThreads);
for (int i = 0; i < numListstatusThreads; i++) { for (int i = 0; i < numListstatusThreads; i++) {
workers.addWorker( workers.addWorker(
new FileStatusProcessor(sourcePathRoot.getFileSystem(getConf()))); new FileStatusProcessor(sourcePathRoot.getFileSystem(getConf()),
excludeList));
} }
for (FileStatus status : sourceDirs) { for (FileStatus status : sourceDirs) {

View File

@ -62,7 +62,6 @@ public void setUp() throws Exception {
options = new DistCpOptions(Arrays.asList(source), target); options = new DistCpOptions(Arrays.asList(source), target);
options.setSyncFolder(true); options.setSyncFolder(true);
options.setDeleteMissing(true);
options.setUseDiff(true, "s1", "s2"); options.setUseDiff(true, "s1", "s2");
options.appendToConf(conf); options.appendToConf(conf);
@ -87,7 +86,7 @@ public void tearDown() throws Exception {
@Test @Test
public void testFallback() throws Exception { public void testFallback() throws Exception {
// the source/target dir are not snapshottable dir // the source/target dir are not snapshottable dir
Assert.assertFalse(DistCpSync.sync(options, conf)); Assert.assertFalse(sync());
// make sure the source path has been updated to the snapshot path // make sure the source path has been updated to the snapshot path
final Path spath = new Path(source, final Path spath = new Path(source,
HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2"); HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
@ -98,7 +97,7 @@ public void testFallback() throws Exception {
// the source/target does not have the given snapshots // the source/target does not have the given snapshots
dfs.allowSnapshot(source); dfs.allowSnapshot(source);
dfs.allowSnapshot(target); dfs.allowSnapshot(target);
Assert.assertFalse(DistCpSync.sync(options, conf)); Assert.assertFalse(sync());
Assert.assertEquals(spath, options.getSourcePaths().get(0)); Assert.assertEquals(spath, options.getSourcePaths().get(0));
// reset source path in options // reset source path in options
@ -106,21 +105,38 @@ public void testFallback() throws Exception {
dfs.createSnapshot(source, "s1"); dfs.createSnapshot(source, "s1");
dfs.createSnapshot(source, "s2"); dfs.createSnapshot(source, "s2");
dfs.createSnapshot(target, "s1"); dfs.createSnapshot(target, "s1");
Assert.assertTrue(DistCpSync.sync(options, conf)); Assert.assertTrue(sync());
// reset source paths in options // reset source paths in options
options.setSourcePaths(Arrays.asList(source)); options.setSourcePaths(Arrays.asList(source));
// changes have been made in target // changes have been made in target
final Path subTarget = new Path(target, "sub"); final Path subTarget = new Path(target, "sub");
dfs.mkdirs(subTarget); dfs.mkdirs(subTarget);
Assert.assertFalse(DistCpSync.sync(options, conf)); Assert.assertFalse(sync());
// make sure the source path has been updated to the snapshot path // make sure the source path has been updated to the snapshot path
Assert.assertEquals(spath, options.getSourcePaths().get(0)); Assert.assertEquals(spath, options.getSourcePaths().get(0));
// reset source paths in options // reset source paths in options
options.setSourcePaths(Arrays.asList(source)); options.setSourcePaths(Arrays.asList(source));
dfs.delete(subTarget, true); dfs.delete(subTarget, true);
Assert.assertTrue(DistCpSync.sync(options, conf)); Assert.assertTrue(sync());
}
private void enableAndCreateFirstSnapshot() throws Exception {
dfs.allowSnapshot(source);
dfs.allowSnapshot(target);
dfs.createSnapshot(source, "s1");
dfs.createSnapshot(target, "s1");
}
private void syncAndVerify() throws Exception {
Assert.assertTrue(sync());
verifyCopy(dfs.getFileStatus(source), dfs.getFileStatus(target), false);
}
private boolean sync() throws Exception {
DistCpSync distCpSync = new DistCpSync(options, conf);
return distCpSync.sync();
} }
/** /**
@ -164,23 +180,30 @@ private void initData(Path dir) throws Exception {
* foo/ f4 * foo/ f4
* f1(new) * f1(new)
*/ */
private void changeData(Path dir) throws Exception { private int changeData(Path dir) throws Exception {
final Path foo = new Path(dir, "foo"); final Path foo = new Path(dir, "foo");
final Path bar = new Path(dir, "bar"); final Path bar = new Path(dir, "bar");
final Path d1 = new Path(foo, "d1"); final Path d1 = new Path(foo, "d1");
final Path f2 = new Path(bar, "f2"); final Path f2 = new Path(bar, "f2");
final Path bar_d1 = new Path(bar, "d1"); final Path bar_d1 = new Path(bar, "d1");
int numCreatedModified = 0;
dfs.rename(d1, bar_d1); dfs.rename(d1, bar_d1);
numCreatedModified += 1; // modify ./foo
numCreatedModified += 1; // modify ./bar
final Path f3 = new Path(bar_d1, "f3"); final Path f3 = new Path(bar_d1, "f3");
dfs.delete(f3, true); dfs.delete(f3, true);
final Path newfoo = new Path(bar_d1, "foo"); final Path newfoo = new Path(bar_d1, "foo");
dfs.rename(foo, newfoo); dfs.rename(foo, newfoo);
numCreatedModified += 1; // modify ./foo/d1
final Path f1 = new Path(newfoo, "f1"); final Path f1 = new Path(newfoo, "f1");
dfs.delete(f1, true); dfs.delete(f1, true);
DFSTestUtil.createFile(dfs, f1, 2 * BLOCK_SIZE, DATA_NUM, 0); DFSTestUtil.createFile(dfs, f1, 2 * BLOCK_SIZE, DATA_NUM, 0);
numCreatedModified += 1; // create ./foo/f1
DFSTestUtil.appendFile(dfs, f2, (int) BLOCK_SIZE); DFSTestUtil.appendFile(dfs, f2, (int) BLOCK_SIZE);
numCreatedModified += 1; // modify ./bar/f2
dfs.rename(bar, new Path(dir, "foo")); dfs.rename(bar, new Path(dir, "foo"));
return numCreatedModified;
} }
/** /**
@ -190,13 +213,10 @@ private void changeData(Path dir) throws Exception {
public void testSync() throws Exception { public void testSync() throws Exception {
initData(source); initData(source);
initData(target); initData(target);
dfs.allowSnapshot(source); enableAndCreateFirstSnapshot();
dfs.allowSnapshot(target);
dfs.createSnapshot(source, "s1");
dfs.createSnapshot(target, "s1");
// make changes under source // make changes under source
changeData(source); int numCreatedModified = changeData(source);
dfs.createSnapshot(source, "s2"); dfs.createSnapshot(source, "s2");
// before sync, make some further changes on source. this should not affect // before sync, make some further changes on source. this should not affect
@ -206,17 +226,22 @@ public void testSync() throws Exception {
final Path newdir = new Path(source, "foo/d1/foo/newdir"); final Path newdir = new Path(source, "foo/d1/foo/newdir");
dfs.mkdirs(newdir); dfs.mkdirs(newdir);
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report);
DistCpSync distCpSync = new DistCpSync(options, conf);
// do the sync // do the sync
Assert.assertTrue(DistCpSync.sync(options, conf)); Assert.assertTrue(distCpSync.sync());
// make sure the source path has been updated to the snapshot path // make sure the source path has been updated to the snapshot path
final Path spath = new Path(source, final Path spath = new Path(source,
HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2"); HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
Assert.assertEquals(spath, options.getSourcePaths().get(0)); Assert.assertEquals(spath, options.getSourcePaths().get(0));
// build copy listing // build copy listing
final Path listingPath = new Path("/tmp/META/fileList.seq"); final Path listingPath = new Path("/tmp/META/fileList.seq");
CopyListing listing = new GlobbedCopyListing(conf, new Credentials()); CopyListing listing = new SimpleCopyListing(conf, new Credentials(), distCpSync);
listing.buildListing(listingPath, options); listing.buildListing(listingPath, options);
Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath); Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath);
@ -232,6 +257,9 @@ public void testSync() throws Exception {
copyMapper.map(entry.getKey(), entry.getValue(), context); copyMapper.map(entry.getKey(), entry.getValue(), context);
} }
// verify that we only list modified and created files/directories
Assert.assertEquals(numCreatedModified, copyListing.size());
// verify that we only copied new appended data of f2 and the new file f1 // verify that we only copied new appended data of f2 and the new file f1
Assert.assertEquals(BLOCK_SIZE * 3, stubContext.getReporter() Assert.assertEquals(BLOCK_SIZE * 3, stubContext.getReporter()
.getCounter(CopyMapper.Counter.BYTESCOPIED).getValue()); .getCounter(CopyMapper.Counter.BYTESCOPIED).getValue());
@ -285,16 +313,13 @@ public void testSyncWithCurrent() throws Exception {
options.setUseDiff(true, "s1", "."); options.setUseDiff(true, "s1", ".");
initData(source); initData(source);
initData(target); initData(target);
dfs.allowSnapshot(source); enableAndCreateFirstSnapshot();
dfs.allowSnapshot(target);
dfs.createSnapshot(source, "s1");
dfs.createSnapshot(target, "s1");
// make changes under source // make changes under source
changeData(source); changeData(source);
// do the sync // do the sync
Assert.assertTrue(DistCpSync.sync(options, conf)); sync();
// make sure the source path is still unchanged // make sure the source path is still unchanged
Assert.assertEquals(source, options.getSourcePaths().get(0)); Assert.assertEquals(source, options.getSourcePaths().get(0));
} }
@ -328,10 +353,7 @@ private void changeData2(Path dir) throws Exception {
public void testSync2() throws Exception { public void testSync2() throws Exception {
initData2(source); initData2(source);
initData2(target); initData2(target);
dfs.allowSnapshot(source); enableAndCreateFirstSnapshot();
dfs.allowSnapshot(target);
dfs.createSnapshot(source, "s1");
dfs.createSnapshot(target, "s1");
// make changes under source // make changes under source
changeData2(source); changeData2(source);
@ -340,9 +362,7 @@ public void testSync2() throws Exception {
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2"); SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report); System.out.println(report);
// do the sync syncAndVerify();
Assert.assertTrue(DistCpSync.sync(options, conf));
verifyCopy(dfs.getFileStatus(source), dfs.getFileStatus(target), false);
} }
private void initData3(Path dir) throws Exception { private void initData3(Path dir) throws Exception {
@ -375,16 +395,13 @@ private void changeData3(Path dir) throws Exception {
} }
/** /**
* Test a case where there are multiple source files with the same name * Test a case where there are multiple source files with the same name.
*/ */
@Test @Test
public void testSync3() throws Exception { public void testSync3() throws Exception {
initData3(source); initData3(source);
initData3(target); initData3(target);
dfs.allowSnapshot(source); enableAndCreateFirstSnapshot();
dfs.allowSnapshot(target);
dfs.createSnapshot(source, "s1");
dfs.createSnapshot(target, "s1");
// make changes under source // make changes under source
changeData3(source); changeData3(source);
@ -393,8 +410,268 @@ public void testSync3() throws Exception {
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2"); SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report); System.out.println(report);
syncAndVerify();
}
private void initData4(Path dir) throws Exception {
final Path d1 = new Path(dir, "d1");
final Path d2 = new Path(d1, "d2");
final Path f1 = new Path(d2, "f1");
DFSTestUtil.createFile(dfs, f1, BLOCK_SIZE, DATA_NUM, 0L);
}
private void changeData4(Path dir) throws Exception {
final Path d1 = new Path(dir, "d1");
final Path d11 = new Path(dir, "d11");
final Path d2 = new Path(d1, "d2");
final Path d21 = new Path(d1, "d21");
final Path f1 = new Path(d2, "f1");
dfs.delete(f1, false);
dfs.rename(d2, d21);
dfs.rename(d1, d11);
}
/**
* Test a case where multiple level dirs are renamed.
*/
@Test
public void testSync4() throws Exception {
initData4(source);
initData4(target);
enableAndCreateFirstSnapshot();
// make changes under source
changeData4(source);
dfs.createSnapshot(source, "s2");
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report);
syncAndVerify();
}
private void initData5(Path dir) throws Exception {
final Path d1 = new Path(dir, "d1");
final Path d2 = new Path(dir, "d2");
final Path f1 = new Path(d1, "f1");
final Path f2 = new Path(d2, "f2");
DFSTestUtil.createFile(dfs, f1, BLOCK_SIZE, DATA_NUM, 0L);
DFSTestUtil.createFile(dfs, f2, BLOCK_SIZE, DATA_NUM, 0L);
}
private void changeData5(Path dir) throws Exception {
final Path d1 = new Path(dir, "d1");
final Path d2 = new Path(dir, "d2");
final Path f1 = new Path(d1, "f1");
final Path tmp = new Path(dir, "tmp");
dfs.delete(f1, false);
dfs.rename(d1, tmp);
dfs.rename(d2, d1);
final Path f2 = new Path(d1, "f2");
dfs.delete(f2, false);
}
/**
* Test a case with different delete and rename sequences.
*/
@Test
public void testSync5() throws Exception {
initData5(source);
initData5(target);
enableAndCreateFirstSnapshot();
// make changes under source
changeData5(source);
dfs.createSnapshot(source, "s2");
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report);
syncAndVerify();
}
private void testAndVerify(int numCreatedModified)
throws Exception{
SnapshotDiffReport report = dfs.getSnapshotDiffReport(source, "s1", "s2");
System.out.println(report);
DistCpSync distCpSync = new DistCpSync(options, conf);
// do the sync // do the sync
Assert.assertTrue(DistCpSync.sync(options, conf)); Assert.assertTrue(distCpSync.sync());
verifyCopy(dfs.getFileStatus(source), dfs.getFileStatus(target), false);
// make sure the source path has been updated to the snapshot path
final Path spath = new Path(source,
HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2");
Assert.assertEquals(spath, options.getSourcePaths().get(0));
// build copy listing
final Path listingPath = new Path("/tmp/META/fileList.seq");
CopyListing listing = new SimpleCopyListing(conf, new Credentials(), distCpSync);
listing.buildListing(listingPath, options);
Map<Text, CopyListingFileStatus> copyListing = getListing(listingPath);
CopyMapper copyMapper = new CopyMapper();
StubContext stubContext = new StubContext(conf, null, 0);
Mapper<Text, CopyListingFileStatus, Text, Text>.Context context =
stubContext.getContext();
// Enable append
context.getConfiguration().setBoolean(
DistCpOptionSwitch.APPEND.getConfigLabel(), true);
copyMapper.setup(context);
for (Map.Entry<Text, CopyListingFileStatus> entry :
copyListing.entrySet()) {
copyMapper.map(entry.getKey(), entry.getValue(), context);
}
// verify that we only list modified and created files/directories
Assert.assertEquals(numCreatedModified, copyListing.size());
// verify the source and target now has the same structure
verifyCopy(dfs.getFileStatus(spath), dfs.getFileStatus(target), false);
}
private void initData6(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path bar = new Path(dir, "bar");
final Path foo_f1 = new Path(foo, "f1");
final Path bar_f1 = new Path(bar, "f1");
DFSTestUtil.createFile(dfs, foo_f1, BLOCK_SIZE, DATA_NUM, 0L);
DFSTestUtil.createFile(dfs, bar_f1, BLOCK_SIZE, DATA_NUM, 0L);
}
private int changeData6(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path bar = new Path(dir, "bar");
final Path foo2 = new Path(dir, "foo2");
final Path foo_f1 = new Path(foo, "f1");
int numCreatedModified = 0;
dfs.rename(foo, foo2);
dfs.rename(bar, foo);
dfs.rename(foo2, bar);
DFSTestUtil.appendFile(dfs, foo_f1, (int) BLOCK_SIZE);
numCreatedModified += 1; // modify ./bar/f1
return numCreatedModified;
}
/**
* Test a case where there is a cycle in renaming dirs.
*/
@Test
public void testSync6() throws Exception {
initData6(source);
initData6(target);
enableAndCreateFirstSnapshot();
int numCreatedModified = changeData6(source);
dfs.createSnapshot(source, "s2");
testAndVerify(numCreatedModified);
}
private void initData7(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path bar = new Path(dir, "bar");
final Path foo_f1 = new Path(foo, "f1");
final Path bar_f1 = new Path(bar, "f1");
DFSTestUtil.createFile(dfs, foo_f1, BLOCK_SIZE, DATA_NUM, 0L);
DFSTestUtil.createFile(dfs, bar_f1, BLOCK_SIZE, DATA_NUM, 0L);
}
private int changeData7(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path foo2 = new Path(dir, "foo2");
final Path foo_f1 = new Path(foo, "f1");
final Path foo2_f2 = new Path(foo2, "f2");
final Path foo_d1 = new Path(foo, "d1");
final Path foo_d1_f3 = new Path(foo_d1, "f3");
int numCreatedModified = 0;
dfs.rename(foo, foo2);
DFSTestUtil.createFile(dfs, foo_f1, BLOCK_SIZE, DATA_NUM, 0L);
numCreatedModified += 2; // create ./foo and ./foo/f1
DFSTestUtil.appendFile(dfs, foo_f1, (int) BLOCK_SIZE);
dfs.rename(foo_f1, foo2_f2);
numCreatedModified -= 1; // mv ./foo/f1
numCreatedModified += 2; // "M ./foo" and "+ ./foo/f2"
DFSTestUtil.createFile(dfs, foo_d1_f3, BLOCK_SIZE, DATA_NUM, 0L);
numCreatedModified += 2; // create ./foo/d1 and ./foo/d1/f3
return numCreatedModified;
}
/**
* Test a case where rename a dir, then create a new dir with the same name
* and sub dir.
*/
@Test
public void testSync7() throws Exception {
initData7(source);
initData7(target);
enableAndCreateFirstSnapshot();
int numCreatedModified = changeData7(source);
dfs.createSnapshot(source, "s2");
testAndVerify(numCreatedModified);
}
private void initData8(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path bar = new Path(dir, "bar");
final Path d1 = new Path(dir, "d1");
final Path foo_f1 = new Path(foo, "f1");
final Path bar_f1 = new Path(bar, "f1");
final Path d1_f1 = new Path(d1, "f1");
DFSTestUtil.createFile(dfs, foo_f1, BLOCK_SIZE, DATA_NUM, 0L);
DFSTestUtil.createFile(dfs, bar_f1, BLOCK_SIZE, DATA_NUM, 0L);
DFSTestUtil.createFile(dfs, d1_f1, BLOCK_SIZE, DATA_NUM, 0L);
}
private int changeData8(Path dir) throws Exception {
final Path foo = new Path(dir, "foo");
final Path createdDir = new Path(dir, "c");
final Path d1 = new Path(dir, "d1");
final Path d1_f1 = new Path(d1, "f1");
final Path createdDir_f1 = new Path(createdDir, "f1");
final Path foo_f3 = new Path(foo, "f3");
final Path new_foo = new Path(createdDir, "foo");
final Path foo_f4 = new Path(foo, "f4");
final Path foo_d1 = new Path(foo, "d1");
final Path bar = new Path(dir, "bar");
final Path bar1 = new Path(dir, "bar1");
int numCreatedModified = 0;
DFSTestUtil.createFile(dfs, foo_f3, BLOCK_SIZE, DATA_NUM, 0L);
numCreatedModified += 1; // create ./c/foo/f3
DFSTestUtil.createFile(dfs, createdDir_f1, BLOCK_SIZE, DATA_NUM, 0L);
numCreatedModified += 1; // create ./c
dfs.rename(createdDir_f1, foo_f4);
numCreatedModified += 1; // create ./c/foo/f4
dfs.rename(d1_f1, createdDir_f1); // rename ./d1/f1 -> ./c/f1
numCreatedModified += 1; // modify ./c/foo/d1
dfs.rename(d1, foo_d1);
numCreatedModified += 1; // modify ./c/foo
dfs.rename(foo, new_foo);
dfs.rename(bar, bar1);
return numCreatedModified;
}
/**
* Test a case where create a dir, then mv a existed dir into it.
*/
@Test
public void testSync8() throws Exception {
initData8(source);
initData8(target);
enableAndCreateFirstSnapshot();
int numCreatedModified = changeData8(source);
dfs.createSnapshot(source, "s2");
testAndVerify(numCreatedModified);
} }
} }

View File

@ -657,7 +657,7 @@ public void testDiffOption() {
false)); false));
DistCpOptions options = OptionsParser.parse(new String[] { "-update", DistCpOptions options = OptionsParser.parse(new String[] { "-update",
"-delete", "-diff", "s1", "s2", "-diff", "s1", "s2",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
options.appendToConf(conf); options.appendToConf(conf);
@ -667,7 +667,7 @@ public void testDiffOption() {
Assert.assertEquals("s2", options.getToSnapshot()); Assert.assertEquals("s2", options.getToSnapshot());
options = OptionsParser.parse(new String[] { options = OptionsParser.parse(new String[] {
"-delete", "-diff", "s1", ".", "-update", "-diff", "s1", ".", "-update",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
options.appendToConf(conf); options.appendToConf(conf);
@ -679,7 +679,7 @@ public void testDiffOption() {
// -diff requires two option values // -diff requires two option values
try { try {
OptionsParser.parse(new String[] {"-diff", "s1", "-delete", "-update", OptionsParser.parse(new String[] {"-diff", "s1", "-update",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
fail("-diff should fail with only one snapshot name"); fail("-diff should fail with only one snapshot name");
@ -688,25 +688,25 @@ public void testDiffOption() {
"Must provide both the starting and ending snapshot names", e); "Must provide both the starting and ending snapshot names", e);
} }
// make sure -diff is only valid when -update and -delete is specified // make sure -diff is only valid when -update is specified
try { try {
OptionsParser.parse(new String[] { "-diff", "s1", "s2", OptionsParser.parse(new String[] { "-diff", "s1", "s2",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
fail("-diff should fail if -update or -delete option is not specified"); fail("-diff should fail if -update option is not specified");
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains( GenericTestUtils.assertExceptionContains(
"Diff is valid only with update and delete options", e); "Diff is valid only with update options", e);
} }
try { try {
OptionsParser.parse(new String[] { "-diff", "s1", "s2", "-update", OptionsParser.parse(new String[] { "-diff", "s1", "s2", "-update", "-delete",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
fail("-diff should fail if -update or -delete option is not specified"); fail("-diff should fail if -delete option is specified");
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains( GenericTestUtils.assertExceptionContains(
"Diff is valid only with update and delete options", e); "Diff is valid only with update options", e);
} }
try { try {
@ -714,10 +714,10 @@ public void testDiffOption() {
"-delete", "-overwrite", "-delete", "-overwrite",
"hdfs://localhost:8020/source/first", "hdfs://localhost:8020/source/first",
"hdfs://localhost:8020/target/" }); "hdfs://localhost:8020/target/" });
fail("-diff should fail if -update or -delete option is not specified"); fail("-diff should fail if -update option is not specified");
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains( GenericTestUtils.assertExceptionContains(
"Diff is valid only with update and delete options", e); "Diff is valid only with update options", e);
} }
} }