From a4f459097b068516702a1edfc1b72384f6fd37ee Mon Sep 17 00:00:00 2001 From: Mohanad Elsafty Date: Fri, 18 Feb 2022 19:12:50 +0800 Subject: [PATCH] HADOOP-18117. Add an option to preserve root directory permissions (#3970) --- .../apache/hadoop/tools/DistCpConstants.java | 2 + .../apache/hadoop/tools/DistCpContext.java | 4 ++ .../hadoop/tools/DistCpOptionSwitch.java | 6 ++- .../apache/hadoop/tools/DistCpOptions.java | 19 ++++++++ .../apache/hadoop/tools/OptionsParser.java | 4 +- .../hadoop/tools/SimpleCopyListing.java | 6 ++- .../hadoop/tools/mapred/CopyCommitter.java | 10 ++++- .../src/site/markdown/DistCp.md.vm | 1 + .../hadoop/tools/TestDistCpOptions.java | 13 +++++- .../apache/hadoop/tools/TestDistCpSystem.java | 43 +++++++++++++++++++ .../hadoop/tools/TestOptionsParser.java | 14 ++++++ 11 files changed, 115 insertions(+), 7 deletions(-) diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java index c75c0e85dd..289d552b86 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java @@ -86,6 +86,8 @@ private DistCpConstants() { public static final String CONF_LABEL_SPLIT_RATIO = "distcp.dynamic.split.ratio"; public static final String CONF_LABEL_DIRECT_WRITE = "distcp.direct.write"; + public static final String CONF_LABEL_UPDATE_ROOT = + "distcp.update.root.attributes"; /* Total bytes to be copied. Updated by copylisting. Unfiltered count */ public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected"; diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java index 8e9d64a376..8c443f66a0 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpContext.java @@ -175,6 +175,10 @@ public boolean shouldUseIterator() { return options.shouldUseIterator(); } + public boolean shouldUpdateRoot() { + return options.shouldUpdateRoot(); + } + public final boolean splitLargeFile() { return options.getBlocksPerChunk() > 0; } diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java index 4163f8274d..becd537f68 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java @@ -244,8 +244,12 @@ public enum DistCpOptionSwitch { USE_ITERATOR(DistCpConstants.CONF_LABEL_USE_ITERATOR, new Option("useiterator", false, "Use single threaded list status iterator to build " - + "the listing to save the memory utilisation at the client")); + + "the listing to save the memory utilisation at the client")), + UPDATE_ROOT(DistCpConstants.CONF_LABEL_UPDATE_ROOT, + new Option("updateRoot", false, + "Update root directory attributes " + + "(eg permissions, ownership ...)")); public static final String PRESERVE_STATUS_DEFAULT = "-prbugpct"; private final String confLabel; diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java index fee53ec1d6..e5685c1492 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java @@ -162,6 +162,8 @@ public final class DistCpOptions { private final boolean useIterator; + private final boolean updateRoot; + /** * File attributes for preserve. * @@ -228,6 +230,8 @@ private DistCpOptions(Builder builder) { this.directWrite = builder.directWrite; this.useIterator = builder.useIterator; + + this.updateRoot = builder.updateRoot; } public Path getSourceFileListing() { @@ -374,6 +378,10 @@ public boolean shouldUseIterator() { return useIterator; } + public boolean shouldUpdateRoot() { + return updateRoot; + } + /** * Add options to configuration. These will be used in the Mapper/committer * @@ -427,6 +435,9 @@ public void appendToConf(Configuration conf) { DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.USE_ITERATOR, String.valueOf(useIterator)); + + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.UPDATE_ROOT, + String.valueOf(updateRoot)); } /** @@ -465,6 +476,7 @@ public String toString() { ", verboseLog=" + verboseLog + ", directWrite=" + directWrite + ", useiterator=" + useIterator + + ", updateRoot=" + updateRoot + '}'; } @@ -518,6 +530,8 @@ public static class Builder { private boolean useIterator = false; + private boolean updateRoot = false; + public Builder(List sourcePaths, Path targetPath) { Preconditions.checkArgument(sourcePaths != null && !sourcePaths.isEmpty(), "Source paths should not be null or empty!"); @@ -780,6 +794,11 @@ public Builder withUseIterator(boolean useItr) { this.useIterator = useItr; return this; } + + public Builder withUpdateRoot(boolean updateRootAttrs) { + this.updateRoot = updateRootAttrs; + return this; + } } } diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java index 476b57b995..f2875c5f33 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java @@ -117,7 +117,9 @@ public static DistCpOptions parse(String[] args) .withDirectWrite( command.hasOption(DistCpOptionSwitch.DIRECT_WRITE.getSwitch())) .withUseIterator( - command.hasOption(DistCpOptionSwitch.USE_ITERATOR.getSwitch())); + command.hasOption(DistCpOptionSwitch.USE_ITERATOR.getSwitch())) + .withUpdateRoot( + command.hasOption(DistCpOptionSwitch.UPDATE_ROOT.getSwitch())); if (command.hasOption(DistCpOptionSwitch.DIFF.getSwitch())) { String[] snapshots = getVals(command, diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java index e9eb4c0923..c3f097da4b 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java @@ -616,10 +616,12 @@ private void writeToFileListingRoot(SequenceFile.Writer fileListWriter, DistCpContext context) throws IOException { boolean syncOrOverwrite = context.shouldSyncFolder() || context.shouldOverwrite(); + boolean skipRootPath = syncOrOverwrite && !context.shouldUpdateRoot(); for (CopyListingFileStatus fs : fileStatus) { if (fs.getPath().equals(sourcePathRoot) && - fs.isDirectory() && syncOrOverwrite) { - // Skip the root-paths when syncOrOverwrite + fs.isDirectory() && skipRootPath) { + // Skip the root-paths when skipRootPath (syncOrOverwrite and + // update root directory is not a must). LOG.debug("Skip {}", fs.getPath()); return; } diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java index 2272781f72..8c2bc82d3f 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java @@ -75,6 +75,7 @@ public class CopyCommitter extends FileOutputCommitter { private boolean ignoreFailures = false; private boolean skipCrc = false; private int blocksPerChunk = 0; + private boolean updateRoot = false; /** * Create a output committer @@ -100,6 +101,8 @@ public void commitJob(JobContext jobContext) throws IOException { Configuration conf = jobContext.getConfiguration(); syncFolder = conf.getBoolean(DistCpConstants.CONF_LABEL_SYNC_FOLDERS, false); overwrite = conf.getBoolean(DistCpConstants.CONF_LABEL_OVERWRITE, false); + updateRoot = + conf.getBoolean(CONF_LABEL_UPDATE_ROOT, false); targetPathExists = conf.getBoolean( DistCpConstants.CONF_LABEL_TARGET_PATH_EXISTS, true); ignoreFailures = conf.getBoolean( @@ -336,9 +339,12 @@ private void preserveFileAttributesForDirectories(Configuration conf) Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath); // - // Skip the root folder when syncOrOverwrite is true. + // Skip the root folder when skipRoot is true. // - if (targetRoot.equals(targetFile) && syncOrOverwrite) continue; + boolean skipRoot = syncOrOverwrite && !updateRoot; + if (targetRoot.equals(targetFile) && skipRoot) { + continue; + } FileSystem targetFS = targetFile.getFileSystem(conf); DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes, diff --git a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm index 136b6c8ca1..030f20c23b 100644 --- a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm +++ b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm @@ -363,6 +363,7 @@ Command Line Options | `-xtrack ` | Save information about missing source files to the specified path. | This option is only valid with `-update` option. This is an experimental property and it cannot be used with `-atomic` option. | | `-direct` | Write directly to destination paths | Useful for avoiding potentially very expensive temporary file rename operations when the destination is an object store | | `-useiterator` | Uses single threaded listStatusIterator to build listing | Useful for saving memory at the client side. Using this option will ignore the numListstatusThreads option | +| `-updateRoot` | Update root directory attributes (eg permissions, ownership ...) | Useful if you need to enforce root directory attributes update when using distcp | Architecture of DistCp ---------------------- diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java index 13497029a0..ac2100ec98 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpOptions.java @@ -289,7 +289,7 @@ public void testToString() { "atomicWorkPath=null, logPath=null, sourceFileListing=abc, " + "sourcePaths=null, targetPath=xyz, filtersFile='null', " + "blocksPerChunk=0, copyBufferSize=8192, verboseLog=false, " + - "directWrite=false, useiterator=false}"; + "directWrite=false, useiterator=false, updateRoot=false}"; String optionString = option.toString(); Assert.assertEquals(val, optionString); Assert.assertNotSame(DistCpOptionSwitch.ATOMIC_COMMIT.toString(), @@ -563,4 +563,15 @@ public void testAppendToConf() { "otherwise it may not be fetched properly", expectedValForEmptyConfigKey, config.get("")); } + + @Test + public void testUpdateRoot() { + final DistCpOptions options = new DistCpOptions.Builder( + Collections.singletonList( + new Path("hdfs://localhost:8020/source")), + new Path("hdfs://localhost:8020/target/")) + .withUpdateRoot(true) + .build(); + Assert.assertTrue(options.shouldUpdateRoot()); + } } diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSystem.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSystem.java index 47b850f4ba..64c6800f94 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSystem.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSystem.java @@ -21,6 +21,7 @@ import static org.apache.hadoop.test.GenericTestUtils.getMethodName; import static org.hamcrest.core.Is.is; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import java.io.ByteArrayOutputStream; @@ -44,6 +45,8 @@ import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.tools.util.DistCpTestUtils; import org.apache.hadoop.util.ToolRunner; import org.junit.AfterClass; import org.junit.Assert; @@ -551,4 +554,44 @@ public void testSourceRoot() throws Exception { String[] args2 = new String[]{rootStr, tgtStr2}; Assert.assertThat(ToolRunner.run(conf, new DistCp(), args2), is(0)); } + + @Test + public void testUpdateRoot() throws Exception { + FileSystem fs = cluster.getFileSystem(); + + Path source = new Path("/src"); + Path dest1 = new Path("/dest1"); + Path dest2 = new Path("/dest2"); + + fs.delete(source, true); + fs.delete(dest1, true); + fs.delete(dest2, true); + + // Create a source dir + fs.mkdirs(source); + fs.setOwner(source, "userA", "groupA"); + fs.setTimes(source, new Random().nextLong(), new Random().nextLong()); + + GenericTestUtils.createFiles(fs, source, 3, 5, 5); + + // should not preserve attrs + DistCpTestUtils.assertRunDistCp(DistCpConstants.SUCCESS, source.toString(), + dest1.toString(), "-p -update", conf); + + FileStatus srcStatus = fs.getFileStatus(source); + FileStatus destStatus1 = fs.getFileStatus(dest1); + assertNotEquals(srcStatus.getOwner(), destStatus1.getOwner()); + assertNotEquals(srcStatus.getModificationTime(), + destStatus1.getModificationTime()); + + // should preserve attrs + DistCpTestUtils.assertRunDistCp(DistCpConstants.SUCCESS, source.toString(), + dest2.toString(), "-p -update -updateRoot", + conf); + + FileStatus destStatus2 = fs.getFileStatus(dest2); + assertEquals(srcStatus.getOwner(), destStatus2.getOwner()); + assertEquals(srcStatus.getModificationTime(), + destStatus2.getModificationTime()); + } } \ No newline at end of file diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java index 85b312a94b..1ffdd89073 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java @@ -804,4 +804,18 @@ public void testExclusionsOption() { "hdfs://localhost:8020/target/"}); assertThat(options.getFiltersFile()).isEqualTo("/tmp/filters.txt"); } + + @Test + public void testParseUpdateRoot() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldUpdateRoot()); + + options = OptionsParser.parse(new String[] { + "-updateRoot", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldUpdateRoot()); + } }