HADOOP-11021. Configurable replication factor in the hadoop archive command. Contributed by Zhe Zhang.

Andrew Wang 2014-08-29 14:44:37 -07:00
parent c60da4d3b3
commit ea1c6f31c2
3 changed files with 59 additions and 6 deletions


@@ -38,7 +38,7 @@ Overview
 How to Create an Archive
 ------------------------
 
-`Usage: hadoop archive -archiveName name -p <parent> <src>* <dest>`
+`Usage: hadoop archive -archiveName name -p <parent> [-r <replication factor>] <src>* <dest>`
 
 -archiveName is the name of the archive you would like to create. An example
 would be foo.har. The name should have a \*.har extension. The parent argument
@@ -52,9 +52,12 @@ How to Create an Archive
 would need a map reduce cluster to run this. For a detailed example see the
 later sections.
 
+-r indicates the desired replication factor; if this optional argument is
+not specified, a replication factor of 10 will be used.
+
 If you just want to archive a single directory /foo/bar then you can just use
 
-`hadoop archive -archiveName zoo.har -p /foo/bar /outputdir`
+`hadoop archive -archiveName zoo.har -p /foo/bar -r 3 /outputdir`
 
 How to Look Up Files in Archives
 --------------------------------
@@ -90,14 +93,15 @@ Archives Examples
 $H3 Creating an Archive
 
-`hadoop archive -archiveName foo.har -p /user/hadoop dir1 dir2 /user/zoo`
+`hadoop archive -archiveName foo.har -p /user/hadoop -r 3 dir1 dir2 /user/zoo`
 
 The above example is creating an archive using /user/hadoop as the relative
 archive directory. The directories /user/hadoop/dir1 and /user/hadoop/dir2
 will be archived in the following file system directory -- /user/zoo/foo.har.
 Archiving does not delete the input files. If you want to delete the input
 files after creating the archives (to reduce namespace), you will have to do
-it on your own.
+it on your own. In this example, because `-r 3` is specified, a replication
+factor of 3 will be used.
 
 $H3 Looking Up Files
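
To see the effect of `-r` after the fact, the replication factor of the archive's part files can be inspected once the job finishes. A minimal check for the example above, assuming HAR's usual part-file naming (a single part file named part-0):

`hadoop fs -stat %r /user/zoo/foo.har/part-0`

With `-r 3` this should print 3 rather than the default 10.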


@@ -97,9 +97,12 @@ public class HadoopArchives implements Tool {
   long partSize = 2 * 1024 * 1024 * 1024l;
   /** size of blocks in hadoop archives **/
   long blockSize = 512 * 1024 * 1024l;
+  /** the desired replication degree; default is 10 **/
+  short repl = 10;
 
   private static final String usage = "archive"
-  + " -archiveName NAME -p <parent path> <src>* <dest>" +
+  + " -archiveName NAME -p <parent path> [-r <replication factor>]" +
+  " <src>* <dest>" +
   "\n";
@@ -542,7 +545,7 @@ void archive(Path parentPath, List<Path> srcPaths,
       srcWriter.close();
     }
     //increase the replication of src files
-    jobfs.setReplication(srcFiles, (short) 10);
+    jobfs.setReplication(srcFiles, repl);
     conf.setInt(SRC_COUNT_LABEL, numFiles);
     conf.setLong(TOTAL_SIZE_LABEL, totalSize);
     int numMaps = (int)(totalSize/partSize);
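
For context on the changed line: srcFiles is the listing that the archiving MapReduce job reads as its input, so its replication is raised just before the job is configured; the patch replaces the hard-coded 10 with the user-chosen repl. A minimal standalone sketch of the same FileSystem#setReplication call, with a hypothetical path standing in for the tool's actual job file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SetReplicationSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Hypothetical stand-in for the archive job's source-list file.
    Path srcFiles = new Path("/tmp/har-job/_src_files");
    // Request that the existing file be re-replicated to factor 3;
    // returns true if the filesystem accepted the change.
    boolean accepted = fs.setReplication(srcFiles, (short) 3);
    System.out.println("setReplication accepted: " + accepted);
  }
}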
@@ -835,6 +838,11 @@ public int run(String[] args) throws Exception {
       }
       i+=2;
 
+      if ("-r".equals(args[i])) {
+        repl = Short.parseShort(args[i+1]);
+        i+=2;
+      }
+
       //read the rest of the paths
       for (; i < args.length; i++) {
         if (i == (args.length - 1)) {
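
Note that the new branch only fires when -r arrives as its own token and it consumes two arguments (the flag and its value), so -r must sit exactly where the usage string places it: after -p <parent path> and before the source paths. A standalone sketch of the same two-token pattern, with an added bounds check; the class and variable names here are illustrative, not part of the patch:

public class ReplFlagSketch {
  public static void main(String[] args) {
    String[] argv = { "-r", "3", "dir1", "dir2", "/user/zoo" };
    short repl = 10; // default when -r is absent
    int i = 0;
    // "-r" must be a separate token; a single "-r 3" string would not match.
    if (i + 1 < argv.length && "-r".equals(argv[i])) {
      repl = Short.parseShort(argv[i + 1]);
      i += 2;
    }
    System.out.println("replication = " + repl);
    for (; i < argv.length; i++) {
      System.out.println("path argument: " + argv[i]);
    }
  }
}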


@@ -157,6 +157,24 @@ public void testRelativePath() throws Exception {
     final List<String> harPaths = lsr(shell, fullHarPathStr);
     Assert.assertEquals(originalPaths, harPaths);
   }
 
+  @Test
+  public void testRelativePathWithRepl() throws Exception {
+    final Path sub1 = new Path(inputPath, "dir1");
+    fs.mkdirs(sub1);
+    createFile(inputPath, fs, sub1.getName(), "a");
+    final FsShell shell = new FsShell(conf);
+
+    final List<String> originalPaths = lsr(shell, "input");
+    System.out.println("originalPaths: " + originalPaths);
+
+    // make the archive:
+    final String fullHarPathStr = makeArchiveWithRepl();
+
+    // compare results:
+    final List<String> harPaths = lsr(shell, fullHarPathStr);
+    Assert.assertEquals(originalPaths, harPaths);
+  }
+
   @Test
   public void testPathWithSpaces() throws Exception {
@@ -625,6 +643,29 @@ private String makeArchive() throws Exception {
     assertEquals(0, ToolRunner.run(har, args));
     return fullHarPathStr;
   }
 
+  /*
+   * Run the HadoopArchives tool to create an archive on the
+   * given file system with a specified replication degree.
+   */
+  private String makeArchiveWithRepl() throws Exception {
+    final String inputPathStr = inputPath.toUri().getPath();
+    System.out.println("inputPathStr = " + inputPathStr);
+
+    final URI uri = fs.getUri();
+    final String prefix = "har://hdfs-" + uri.getHost() + ":" + uri.getPort()
+        + archivePath.toUri().getPath() + Path.SEPARATOR;
+
+    final String harName = "foo.har";
+    final String fullHarPathStr = prefix + harName;
+    final String[] args = { "-archiveName", harName, "-p", inputPathStr,
+        "-r", "3", "*", archivePath.toString() };
+    System.setProperty(HadoopArchives.TEST_HADOOP_ARCHIVES_JAR_PATH,
+        HADOOP_ARCHIVES_JAR);
+    final HadoopArchives har = new HadoopArchives(conf);
+    assertEquals(0, ToolRunner.run(har, args));
+    return fullHarPathStr;
+  }
+
   @Test
   /*
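
The new test compares directory listings only, so it would still pass if the requested factor were silently ignored. A hypothetical follow-up assertion, not part of this patch, could read the part files' replication directly; this sketch assumes the test class's existing fs and archivePath fields plus an org.apache.hadoop.fs.FileStatus import:

  private void assertPartFileReplication(short expected) throws Exception {
    final Path harDir = new Path(archivePath, "foo.har");
    for (FileStatus status : fs.listStatus(harDir)) {
      // HAR data lives in part-* files; check each one's actual factor.
      if (status.getPath().getName().startsWith("part-")) {
        Assert.assertEquals(expected, status.getReplication());
      }
    }
  }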