From 7ef4d942dd96232b0743a40ed25f77065254f94d Mon Sep 17 00:00:00 2001
From: Steve Loughran
Date: Thu, 8 Mar 2018 11:24:06 +0000
Subject: [PATCH] HADOOP-15273. distcp can't handle remote stores with
 different checksum algorithms. Contributed by Steve Loughran.

---
 .../apache/hadoop/tools/DistCpOptions.java      |  5 ----
 .../mapred/RetriableFileCopyCommand.java        | 29 ++++++++++++++-----
 .../hadoop/tools/mapred/TestCopyMapper.java     | 14 ++++-----
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
index ece1a9426e..f33f7fdcb9 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java
@@ -534,11 +534,6 @@ private void validate() {
           + "mutually exclusive");
     }
 
-    if (!syncFolder && skipCRC) {
-      throw new IllegalArgumentException(
-          "Skip CRC is valid only with update options");
-    }
-
     if (!syncFolder && append) {
       throw new IllegalArgumentException(
           "Append is valid only with update options");
diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
index 031106134c..55f90d032c 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java
@@ -210,15 +210,30 @@ private void compareCheckSums(FileSystem sourceFS, Path source,
       throws IOException {
     if (!DistCpUtils.checksumsAreEqual(sourceFS, source, sourceChecksum,
         targetFS, target)) {
-      StringBuilder errorMessage = new StringBuilder("Check-sum mismatch between ")
-          .append(source).append(" and ").append(target).append(".");
-      if (sourceFS.getFileStatus(source).getBlockSize() !=
+      StringBuilder errorMessage =
+          new StringBuilder("Checksum mismatch between ")
+              .append(source).append(" and ").append(target).append(".");
+      boolean addSkipHint = false;
+      String srcScheme = sourceFS.getScheme();
+      String targetScheme = targetFS.getScheme();
+      if (!srcScheme.equals(targetScheme)
+          && !(srcScheme.contains("hdfs") && targetScheme.contains("hdfs"))) {
+        // the filesystems are different and they aren't both hdfs connectors
+        errorMessage.append("Source and destination filesystems are of"
+            + " different types\n")
+            .append("Their checksum algorithms may be incompatible");
+        addSkipHint = true;
+      } else if (sourceFS.getFileStatus(source).getBlockSize() !=
           targetFS.getFileStatus(target).getBlockSize()) {
-        errorMessage.append(" Source and target differ in block-size.")
-            .append(" Use -pb to preserve block-sizes during copy.")
-            .append(" Alternatively, skip checksum-checks altogether, using -skipCrc.")
+        errorMessage.append(" Source and target differ in block-size.\n")
+            .append(" Use -pb to preserve block-sizes during copy.");
+        addSkipHint = true;
+      }
+      if (addSkipHint) {
+        errorMessage.append(" You can skip checksum-checks altogether "
+            + " with -skipcrccheck.\n")
             .append(" (NOTE: By skipping checksums, one runs the risk of " +
-                "masking data-corruption during file-transfer.)");
+                "masking data-corruption during file-transfer.)\n");
       }
       throw new IOException(errorMessage.toString());
     }
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
index da51326454..bbe7e8d925 100644
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
@@ -45,6 +45,7 @@
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.tools.CopyListingFileStatus;
 import org.apache.hadoop.tools.DistCpConstants;
 import org.apache.hadoop.tools.DistCpOptionSwitch;
@@ -937,7 +938,7 @@ public void testPreserveBlockSizeAndReplication() {
   }
 
   @Test(timeout=40000)
-  public void testCopyFailOnBlockSizeDifference() {
+  public void testCopyFailOnBlockSizeDifference() throws Exception {
     try {
       deleteState();
       createSourceDataWithDifferentBlockSize();
@@ -964,12 +965,11 @@ public void testCopyFailOnBlockSizeDifference() {
 
       Assert.fail("Copy should have failed because of block-size difference.");
     }
-    catch (Exception exception) {
-      // Check that the exception suggests the use of -pb/-skipCrc.
-      Assert.assertTrue("Failure exception should have suggested the use of -pb.",
-          exception.getCause().getCause().getMessage().contains("pb"));
-      Assert.assertTrue("Failure exception should have suggested the use of -skipCrc.",
-          exception.getCause().getCause().getMessage().contains("skipCrc"));
+    catch (IOException exception) {
+      // Check that the exception suggests the use of -pb/-skipcrccheck.
+      Throwable cause = exception.getCause().getCause();
+      GenericTestUtils.assertExceptionContains("-pb", cause);
+      GenericTestUtils.assertExceptionContains("-skipcrccheck", cause);
     }
   }
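
Context for reviewers: the core of the change is that compareCheckSums()
now distinguishes "different checksum algorithms" from "possible
corruption". When the source and target schemes differ and the two stores
are not both HDFS-like, the mismatch message suggests -skipcrccheck
instead of implying data corruption; and with the DistCpOptions validation
removed, -skipcrccheck no longer has to be combined with -update. Below is
a minimal standalone sketch of that scheme check, not code from the patch:
the class name, cluster address, bucket and paths are hypothetical, and it
assumes the relevant filesystem connectors are on the classpath.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Sketch: decide whether a FileChecksum mismatch is meaningful. */
public final class ChecksumCompatSketch {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical endpoints; substitute any two stores.
    Path source = new Path("hdfs://namenode:8020/data/part-00000");
    Path target = new Path("s3a://bucket/data/part-00000");

    FileSystem sourceFS = source.getFileSystem(conf);
    FileSystem targetFS = target.getFileSystem(conf);

    String srcScheme = sourceFS.getScheme();
    String targetScheme = targetFS.getScheme();
    if (!srcScheme.equals(targetScheme)
        && !(srcScheme.contains("hdfs") && targetScheme.contains("hdfs"))) {
      // Different store types: HDFS reports an MD5-of-CRC based checksum,
      // while an object store may return null or use its own algorithm,
      // so an equals() comparison proves nothing.
      System.out.println("Checksum algorithms may be incompatible;"
          + " consider distcp -skipcrccheck");
      return;
    }

    // Same store type (or both HDFS-like): a mismatch is meaningful.
    FileChecksum sourceChecksum = sourceFS.getFileChecksum(source);
    FileChecksum targetChecksum = targetFS.getFileChecksum(target);
    if (sourceChecksum != null && targetChecksum != null
        && !sourceChecksum.equals(targetChecksum)) {
      System.out.println("Checksum mismatch: possible corruption");
    }
  }
}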