HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl. Contributed by Kanaka Kumar Avvaru.

This commit is contained in:
Akira Ajisaka 2015-12-18 13:58:28 +09:00
parent bd5e207432
commit 061c05cc05
5 changed files with 55 additions and 20 deletions

View File

@ -688,6 +688,9 @@ Release 2.8.0 - UNRELEASED
HADOOP-10300. Allowed deferred sending of call responses. (Daryn Sharp via HADOOP-10300. Allowed deferred sending of call responses. (Daryn Sharp via
yliu) yliu)
HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl.
(Kanaka Kumar Avvaru via aajisaka)
IMPROVEMENTS IMPROVEMENTS
HADOOP-12458. Retries is typoed to spell Retires in parts of HADOOP-12458. Retries is typoed to spell Retires in parts of

View File

@ -53,24 +53,29 @@ public static void registerCommands(CommandFactory factory) {
/** merge multiple files together */ /** merge multiple files together */
public static class Merge extends FsCommand { public static class Merge extends FsCommand {
public static final String NAME = "getmerge"; public static final String NAME = "getmerge";
public static final String USAGE = "[-nl] <src> <localdst>"; public static final String USAGE = "[-nl] [-skip-empty-file] "
+ "<src> <localdst>";
public static final String DESCRIPTION = public static final String DESCRIPTION =
"Get all the files in the directories that " + "Get all the files in the directories that "
"match the source file pattern and merge and sort them to only " + + "match the source file pattern and merge and sort them to only "
"one file on local fs. <src> is kept.\n" + + "one file on local fs. <src> is kept.\n"
"-nl: Add a newline character at the end of each file."; + "-nl: Add a newline character at the end of each file.\n"
+ "-skip-empty-file: Do not add new line character for empty file.";
protected PathData dst = null; protected PathData dst = null;
protected String delimiter = null; protected String delimiter = null;
private boolean skipEmptyFileDelimiter;
protected List<PathData> srcs = null; protected List<PathData> srcs = null;
@Override @Override
protected void processOptions(LinkedList<String> args) throws IOException { protected void processOptions(LinkedList<String> args) throws IOException {
try { try {
CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl"); CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl",
"skip-empty-file");
cf.parse(args); cf.parse(args);
delimiter = cf.getOpt("nl") ? "\n" : null; delimiter = cf.getOpt("nl") ? "\n" : null;
skipEmptyFileDelimiter = cf.getOpt("skip-empty-file");
dst = new PathData(new URI(args.removeLast()), getConf()); dst = new PathData(new URI(args.removeLast()), getConf());
if (dst.exists && dst.stat.isDirectory()) { if (dst.exists && dst.stat.isDirectory()) {
@ -92,21 +97,26 @@ protected void processArguments(LinkedList<PathData> items)
FSDataOutputStream out = dst.fs.create(dst.path); FSDataOutputStream out = dst.fs.create(dst.path);
try { try {
for (PathData src : srcs) { for (PathData src : srcs) {
FSDataInputStream in = src.fs.open(src.path); if (src.stat.getLen() != 0) {
try { try (FSDataInputStream in = src.fs.open(src.path)) {
IOUtils.copyBytes(in, out, getConf(), false); IOUtils.copyBytes(in, out, getConf(), false);
if (delimiter != null) { writeDelimiter(out);
out.write(delimiter.getBytes("UTF-8"));
} }
} finally { } else if (!skipEmptyFileDelimiter) {
in.close(); writeDelimiter(out);
} }
} }
} finally { } finally {
out.close(); out.close();
} }
} }
/**
 * Writes the configured delimiter to the given output stream.
 * Does nothing when no delimiter is set.
 *
 * @param out stream the delimiter bytes are written to
 * @throws IOException if the write fails
 */
private void writeDelimiter(FSDataOutputStream out) throws IOException {
  if (delimiter == null) {
    return;
  }
  // Use the charset constant rather than a name lookup: there is no string
  // to mistype and no UnsupportedEncodingException path to reason about.
  out.write(delimiter.getBytes(java.nio.charset.StandardCharsets.UTF_8));
}
@Override @Override
protected void processNonexistentPath(PathData item) throws IOException { protected void processNonexistentPath(PathData item) throws IOException {
exitCode = 1; // flag that a path is bad exitCode = 1; // flag that a path is bad

View File

@ -375,6 +375,7 @@ getmerge
Usage: `hadoop fs -getmerge [-nl] <src> <localdst>` Usage: `hadoop fs -getmerge [-nl] <src> <localdst>`
Takes a source directory and a destination file as input and concatenates files in src into the destination local file. Optionally -nl can be set to enable adding a newline character (LF) at the end of each file. Takes a source directory and a destination file as input and concatenates files in src into the destination local file. Optionally -nl can be set to enable adding a newline character (LF) at the end of each file.
-skip-empty-file can be used to avoid unwanted newline characters in case of empty files.
Examples: Examples:

View File

@ -318,6 +318,7 @@ public void testCopyMerge() throws Exception {
Path f1 = new Path(root, "f1"); Path f1 = new Path(root, "f1");
Path f2 = new Path(root, "f2"); Path f2 = new Path(root, "f2");
Path f3 = new Path(root, "f3"); Path f3 = new Path(root, "f3");
Path empty = new Path(root, "empty");
Path fnf = new Path(root, "fnf"); Path fnf = new Path(root, "fnf");
Path d = new Path(root, "dir"); Path d = new Path(root, "dir");
Path df1 = new Path(d, "df1"); Path df1 = new Path(d, "df1");
@ -325,7 +326,8 @@ public void testCopyMerge() throws Exception {
Path df3 = new Path(d, "df3"); Path df3 = new Path(d, "df3");
createFile(f1, f2, f3, df1, df2, df3); createFile(f1, f2, f3, df1, df2, df3);
createEmptyFile(empty);
int exit; int exit;
// one file, kind of silly // one file, kind of silly
exit = shell.run(new String[]{ exit = shell.run(new String[]{
@ -366,6 +368,13 @@ public void testCopyMerge() throws Exception {
assertEquals(0, exit); assertEquals(0, exit);
assertEquals("f1\nf2\n", readFile("out")); assertEquals("f1\nf2\n", readFile("out"));
exit = shell.run(new String[]{
"-getmerge", "-nl", "-skip-empty-file",
f1.toString(), f2.toString(), empty.toString(),
"out" });
assertEquals(0, exit);
assertEquals("f1\nf2\n", readFile("out"));
// glob three files // glob three files
shell.run(new String[]{ shell.run(new String[]{
"-getmerge", "-nl", "-getmerge", "-nl",
@ -374,13 +383,13 @@ public void testCopyMerge() throws Exception {
assertEquals(0, exit); assertEquals(0, exit);
assertEquals("f1\nf2\nf3\n", readFile("out")); assertEquals("f1\nf2\nf3\n", readFile("out"));
// directory with 3 files, should skip subdir // directory with 1 empty + 3 non empty files, should skip subdir
shell.run(new String[]{ shell.run(new String[]{
"-getmerge", "-nl", "-getmerge", "-nl",
root.toString(), root.toString(),
"out" }); "out" });
assertEquals(0, exit); assertEquals(0, exit);
assertEquals("f1\nf2\nf3\n", readFile("out")); assertEquals("\nf1\nf2\nf3\n", readFile("out"));
// subdir // subdir
shell.run(new String[]{ shell.run(new String[]{
@ -538,7 +547,14 @@ private void createFile(Path ... paths) throws IOException {
out.close(); out.close();
} }
} }
/**
 * Creates a file at each given path by opening an output stream on it and
 * closing it immediately, without writing any bytes.
 *
 * @param paths locations of the files to create
 * @throws IOException if creating or closing a stream fails
 */
private void createEmptyFile(Path ... paths) throws IOException {
  for (Path target : paths) {
    // Nothing is written between create and close.
    lfs.create(target).close();
  }
}
private String readFile(String out) throws IOException { private String readFile(String out) throws IOException {
Path path = new Path(out); Path path = new Path(out);
FileStatus stat = lfs.getFileStatus(path); FileStatus stat = lfs.getFileStatus(path);

View File

@ -601,7 +601,7 @@
<comparators> <comparators>
<comparator> <comparator>
<type>RegexpComparator</type> <type>RegexpComparator</type>
<expected-output>^-getmerge \[-nl\] &lt;src&gt; &lt;localdst&gt; :\s*</expected-output> <expected-output>^-getmerge \[-nl\] \[-skip-empty-file\] &lt;src&gt; &lt;localdst&gt; :\s*</expected-output>
</comparator> </comparator>
<comparator> <comparator>
<type>RegexpComparator</type> <type>RegexpComparator</type>
@ -615,6 +615,11 @@
<type>RegexpComparator</type> <type>RegexpComparator</type>
<expected-output>^( |\t)*-nl\s+Add a newline character at the end of each file.( )*</expected-output> <expected-output>^( |\t)*-nl\s+Add a newline character at the end of each file.( )*</expected-output>
</comparator> </comparator>
<comparator>
<type>RegexpComparator</type>
<expected-output>^( |\t)*-skip-empty-file\s+Do not add new line character for empty file.( )*</expected-output>
</comparator>
</comparators> </comparators>
</test> </test>