From c918286b17e7f2a64735d0c972a8dd749e0bf6c4 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 20 May 2016 12:21:35 +0100 Subject: [PATCH] HADOOP-13145 In DistCp, prevent unnecessary getFileStatus call when not preserving metadata. Contributed by Chris Nauroth. --- hadoop-project/pom.xml | 6 + hadoop-tools/hadoop-aws/pom.xml | 11 + .../site/markdown/tools/hadoop-aws/index.md | 9 + .../contract/s3a/TestS3AContractDistCp.java | 46 ++++ hadoop-tools/hadoop-azure/pom.xml | 19 ++ .../TestAzureNativeContractDistCp.java | 33 +++ hadoop-tools/hadoop-distcp/pom.xml | 16 ++ .../apache/hadoop/tools/util/DistCpUtils.java | 10 +- .../contract/AbstractContractDistCpTest.java | 204 ++++++++++++++++++ 9 files changed, 351 insertions(+), 3 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java create mode 100644 hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java create mode 100644 hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 3a2f9d9064..bee2e5810c 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -353,6 +353,12 @@ hadoop-distcp ${project.version} + + org.apache.hadoop + hadoop-distcp + ${project.version} + test-jar + org.apache.hadoop hadoop-datajoin diff --git a/hadoop-tools/hadoop-aws/pom.xml b/hadoop-tools/hadoop-aws/pom.xml index dfcb1b0fbe..c95f1e6cd9 100644 --- a/hadoop-tools/hadoop-aws/pom.xml +++ b/hadoop-tools/hadoop-aws/pom.xml @@ -252,5 +252,16 @@ test jar + + org.apache.hadoop + hadoop-distcp + test + + + org.apache.hadoop + hadoop-distcp + test + test-jar + diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 95e3274dd7..fe81400d3d 100644 --- 
a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -731,6 +731,15 @@ or in batch runs. Smaller values should result in faster test runs, especially when the object store is a long way away. +DistCp tests targeting S3A support a configurable file size. The default is +10 MB, but the configuration value is expressed in KB so that it can be tuned +smaller to achieve faster test runs. + + + scale.test.distcp.file.size.kb + 10240 + + ### Running the Tests After completing the configuration, execute the test run through Maven. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java new file mode 100644 index 0000000000..7eb0afab13 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.fs.contract.s3a;
+
+import static org.apache.hadoop.fs.s3a.Constants.MIN_MULTIPART_THRESHOLD;
+import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/**
+ * Runs the shared DistCp contract tests against the S3A file system.
+ */
+public class TestS3AContractDistCp extends AbstractContractDistCpTest {
+
+  private static final long MULTIPART_SETTING = 8 * 1024 * 1024; // 8 MB
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration s3aConf = super.createConfiguration();
+    s3aConf.setLong(MULTIPART_SIZE, MULTIPART_SETTING);
+    s3aConf.setLong(MIN_MULTIPART_THRESHOLD, MULTIPART_SETTING);
+    return s3aConf;
+  }
+
+  @Override
+  protected S3AContract createContract(Configuration conf) {
+    return new S3AContract(conf);
+  }
+}
diff --git a/hadoop-tools/hadoop-azure/pom.xml b/hadoop-tools/hadoop-azure/pom.xml index 8344ed719f..02a1240141 100644 --- a/hadoop-tools/hadoop-azure/pom.xml +++ b/hadoop-tools/hadoop-azure/pom.xml @@ -193,6 +193,25 @@ test-jar + + org.apache.hadoop + hadoop-mapreduce-client-jobclient + test + + + + org.apache.hadoop + hadoop-distcp + test + + + + org.apache.hadoop + hadoop-distcp + test + test-jar + + org.mockito mockito-all diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java new file mode 100644 index 0000000000..a3750d46ab --- /dev/null +++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azure.contract;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/** Runs the shared DistCp contract tests against WASB. */
+public class TestAzureNativeContractDistCp extends AbstractContractDistCpTest {
+
+  @Override
+  protected NativeAzureFileSystemContract createContract(Configuration conf) {
+    NativeAzureFileSystemContract wasbContract =
+        new NativeAzureFileSystemContract(conf);
+    return wasbContract;
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/pom.xml b/hadoop-tools/hadoop-distcp/pom.xml index 4ea38c302c..2cec22f0ad 100644 --- a/hadoop-tools/hadoop-distcp/pom.xml +++ b/hadoop-tools/hadoop-distcp/pom.xml @@ -186,6 +186,22 @@ + + + prepare-jar + prepare-package + + jar + + + + prepare-test-jar + prepare-package + + test-jar + + + org.apache.maven.plugins diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java index d3d7677ecf..1784c5de51 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java @@ -195,9 +195,13 @@ public static void preserve(FileSystem targetFS, Path path, EnumSet attributes, boolean preserveRawXattrs) throws IOException { - FileStatus 
targetFileStatus = targetFS.getFileStatus(path); - String group = targetFileStatus.getGroup(); - String user = targetFileStatus.getOwner(); + // If not preserving anything from FileStatus, don't bother fetching it. + FileStatus targetFileStatus = attributes.isEmpty() ? null : + targetFS.getFileStatus(path); + String group = targetFileStatus == null ? null : + targetFileStatus.getGroup(); + String user = targetFileStatus == null ? null : + targetFileStatus.getOwner(); boolean chown = false; if (attributes.contains(FileAttribute.ACL)) { diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java new file mode 100644 index 0000000000..a4f50c71ab --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java @@ -0,0 +1,204 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.tools.contract;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.tools.DistCp;
+import org.apache.hadoop.tools.DistCpOptions;
+
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+/**
+ * Contract test suite covering a file system's integration with DistCp. The
+ * tests coordinate two file system instances: one "local", which is the local
+ * file system, and the other "remote", which is the file system implementation
+ * under test. The tests in the suite cover both copying from local to remote
+ * (e.g. a backup use case) and copying from remote to local (e.g. a restore use
+ * case).
+ */
+public abstract class AbstractContractDistCpTest
+    extends AbstractFSContractTestBase {
+
+  @Rule
+  public TestName testName = new TestName();
+
+  private Configuration conf; // contract configuration, also handed to DistCp
+  private FileSystem localFS, remoteFS; // local FS and the FS under test
+  private Path localDir, remoteDir; // per-test directory on each FS
+
+  /** Returns a new Configuration with mapred.job.tracker set to "local". */
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration newConf = new Configuration();
+    newConf.set("mapred.job.tracker", "local");
+    return newConf;
+  }
+
+  /** Binds both file systems and creates a per-test directory on each. */
+  @Before
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    conf = getContract().getConf();
+    localFS = FileSystem.getLocal(conf);
+    remoteFS = getFileSystem();
+    // Paths are isolated per concrete subclass and test method, and are fully
+    // qualified (scheme included) so failures show which side a path is on.
+    Path testSubDir = new Path(getClass().getSimpleName(),
+        testName.getMethodName());
+    localDir = localFS.makeQualified(new Path(new Path(
+        GenericTestUtils.getTestDir().toURI()), testSubDir));
+    mkdirs(localFS, localDir);
+    remoteDir = remoteFS.makeQualified(
+        new Path(getContract().getTestPath(), testSubDir));
+    mkdirs(remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureToRemote() throws Exception {
+    describe("copy a deep directory structure from local to remote");
+    deepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void largeFilesToRemote() throws Exception {
+    describe("copy multiple large files from local to remote");
+    largeFiles(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureFromRemote() throws Exception {
+    describe("copy a deep directory structure from remote to local");
+    deepDirectoryStructure(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  @Test
+  public void largeFilesFromRemote() throws Exception {
+    describe("copy multiple large files from remote to local");
+    largeFiles(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  /**
+   * Executes a test using a file system sub-tree with multiple nesting levels:
+   * files are created in inputDir itself and one and two sub-directories down.
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void deepDirectoryStructure(FileSystem srcFS, Path srcDir,
+      FileSystem dstFS, Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputSubDir1 = new Path(inputDir, "subDir1");
+    Path inputSubDir2 = new Path(inputDir, "subDir2/subDir3");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputSubDir1, "file2");
+    Path inputFile3 = new Path(inputSubDir2, "file3");
+    mkdirs(srcFS, inputSubDir1);
+    mkdirs(srcFS, inputSubDir2);
+    byte[] data1 = dataset(100, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset(200, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset(300, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir1/file2"), data2);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir2/subDir3/file3"), data3);
+  }
+
+  /**
+   * Executes a test using three large files, sized 1, 2 and 3 MB above a base
+   * of scale.test.distcp.file.size.kb converted to whole MB (default 10 MB).
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void largeFiles(FileSystem srcFS, Path srcDir, FileSystem dstFS,
+      Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputDir, "file2");
+    Path inputFile3 = new Path(inputDir, "file3");
+    mkdirs(srcFS, inputDir);
+    int fileSizeKb = conf.getInt("scale.test.distcp.file.size.kb", 10 * 1024);
+    int fileSizeMb = fileSizeKb / 1024; // KB to whole MB, rounding down
+    getLog().info("{} with file size {}", testName.getMethodName(), fileSizeMb);
+    byte[] data1 = dataset((fileSizeMb + 1) * 1024 * 1024, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset((fileSizeMb + 2) * 1024 * 1024, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset((fileSizeMb + 3) * 1024 * 1024, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file2"), data2);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file3"), data3);
+  }
+
+  /**
+   * Executes DistCp and asserts that the job finished successfully.
+   *
+   * @param src source path
+   * @param dst destination path
+   * @throws Exception if there is a failure
+   */
+  private void runDistCp(Path src, Path dst) throws Exception {
+    DistCpOptions options = new DistCpOptions(Arrays.asList(src), dst);
+    Job job = new DistCp(conf, options).execute();
+    assertNotNull("Unexpected null job returned from DistCp execution.", job);
+    assertTrue("DistCp job did not complete.", job.isComplete());
+    assertTrue("DistCp job did not complete successfully.", job.isSuccessful());
+  }
+
+  /**
+   * Creates a directory and any ancestor directories required.
+   *
+   * @param fs FileSystem in which to create directories
+   * @param dir path of directory to create
+   * @throws Exception if there is a failure
+   */
+  private static void mkdirs(FileSystem fs, Path dir) throws Exception {
+    assertTrue("Failed to mkdir " + dir, fs.mkdirs(dir));
+  }
+}