MAPREDUCE-7241. FileInputFormat listStatus with less memory footprint. Contributed by Zhihua Deng

This commit is contained in:
Jason Lowe 2020-04-01 07:45:42 -05:00
parent c162648aff
commit c613296dc8
4 changed files with 114 additions and 11 deletions

View File

@ -193,7 +193,8 @@ protected void addInputPathRecursively(List<FileStatus> result,
if (stat.isDirectory()) { if (stat.isDirectory()) {
addInputPathRecursively(result, fs, stat.getPath(), inputFilter); addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
} else { } else {
result.add(stat); result.add(org.apache.hadoop.mapreduce.lib.input.
FileInputFormat.shrinkStatus(stat));
} }
} }
} }
@ -290,7 +291,8 @@ private List<FileStatus> singleThreadedListStatus(JobConf job, Path[] dirs,
addInputPathRecursively(result, fs, stat.getPath(), addInputPathRecursively(result, fs, stat.getPath(),
inputFilter); inputFilter);
} else { } else {
result.add(stat); result.add(org.apache.hadoop.mapreduce.lib.input.
FileInputFormat.shrinkStatus(stat));
} }
} }
} }

View File

@ -259,7 +259,8 @@ public Result call() throws Exception {
if (recursive && stat.isDirectory()) { if (recursive && stat.isDirectory()) {
result.dirsNeedingRecursiveCalls.add(stat); result.dirsNeedingRecursiveCalls.add(stat);
} else { } else {
result.locatedFileStatuses.add(stat); result.locatedFileStatuses.add(org.apache.hadoop.mapreduce.lib.
input.FileInputFormat.shrinkStatus(stat));
} }
} }
} }

View File

@ -325,7 +325,7 @@ private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs,
addInputPathRecursively(result, fs, stat.getPath(), addInputPathRecursively(result, fs, stat.getPath(),
inputFilter); inputFilter);
} else { } else {
result.add(stat); result.add(shrinkStatus(stat));
} }
} }
} }
@ -364,13 +364,42 @@ protected void addInputPathRecursively(List<FileStatus> result,
if (stat.isDirectory()) { if (stat.isDirectory()) {
addInputPathRecursively(result, fs, stat.getPath(), inputFilter); addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
} else { } else {
result.add(stat); result.add(shrinkStatus(stat));
} }
} }
} }
} }
/**
 * The HdfsBlockLocation includes a LocatedBlock which contains messages
 * for issuing more detailed queries to datanodes about a block, but these
 * messages are useless during job submission currently. This method tries
 * to exclude the LocatedBlock from HdfsBlockLocation by creating a new
 * BlockLocation from the original, reshaping the LocatedFileStatus,
 * allowing {@link #listStatus(JobContext)} to scan more files with less
 * memory footprint.
 * @see BlockLocation
 * @see org.apache.hadoop.fs.HdfsBlockLocation
 * @param origStat The fat FileStatus.
 * @return The FileStatus that has been shrunk.
 */
public static FileStatus shrinkStatus(FileStatus origStat) {
  // Directories and empty files have no block locations worth copying,
  // and a plain (non-located) FileStatus has nothing to shrink.
  if (origStat.isDirectory() || origStat.getLen() == 0 ||
      !(origStat instanceof LocatedFileStatus)) {
    return origStat;
  }
  BlockLocation[] blockLocations =
      ((LocatedFileStatus) origStat).getBlockLocations();
  if (blockLocations == null) {
    // Nothing to copy; returning the original avoids an NPE below.
    return origStat;
  }
  BlockLocation[] locs = new BlockLocation[blockLocations.length];
  for (int i = 0; i < blockLocations.length; i++) {
    // The BlockLocation copy constructor keeps only host/offset/length
    // data, dropping any HdfsBlockLocation subclass state (LocatedBlock).
    locs[i] = new BlockLocation(blockLocations[i]);
  }
  return new LocatedFileStatus(origStat, locs);
}
/** /**
* A factory that makes the split for this class. It can be overridden * A factory that makes the split for this class. It can be overridden
* by sub-classes to make sub-types * by sub-classes to make sub-types

View File

@ -32,11 +32,17 @@
import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.HdfsBlockLocation;
import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.mapred.SplitLocationInfo; import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Job;
@ -238,6 +244,50 @@ public void testListStatusErrorOnNonExistantDir() throws IOException {
} }
} }
@Test
public void testShrinkStatus() throws IOException {
  Configuration conf = getConfiguration();
  MockFileSystem mockFs =
      (MockFileSystem) new Path("test:///").getFileSystem(conf);
  Path dir1 = new Path("test:/a1");
  RemoteIterator<LocatedFileStatus> it = mockFs.listLocatedStatus(dir1);
  boolean sawLocations = false;
  while (it.hasNext()) {
    LocatedFileStatus original = it.next();
    LocatedFileStatus slim =
        (LocatedFileStatus) FileInputFormat.shrinkStatus(original);
    // Shrinking must not affect FileStatus equality.
    Assert.assertTrue(original.equals(slim));
    BlockLocation[] slimLocs = slim.getBlockLocations();
    if (slimLocs == null) {
      Assert.assertTrue(original.getBlockLocations() == null);
      continue;
    }
    BlockLocation[] origLocs = original.getBlockLocations();
    Assert.assertEquals(origLocs.length, slimLocs.length);
    for (int i = 0; i < slimLocs.length; i++) {
      sawLocations = true;
      BlockLocation slimLoc = slimLocs[i];
      BlockLocation origLoc = origLocs[i];
      // The mock emits HdfsBlockLocations carrying a LocatedBlock ...
      Assert.assertNotNull(((HdfsBlockLocation) origLoc).getLocatedBlock());
      // ... while the shrunk copy must be exactly a plain BlockLocation.
      Assert.assertEquals(BlockLocation.class.getName(),
          slimLoc.getClass().getName());
      // Every visible BlockLocation field must survive the copy.
      Assert.assertArrayEquals(origLoc.getHosts(), slimLoc.getHosts());
      Assert.assertArrayEquals(origLoc.getCachedHosts(),
          slimLoc.getCachedHosts());
      Assert.assertArrayEquals(origLoc.getStorageIds(),
          slimLoc.getStorageIds());
      Assert.assertArrayEquals(origLoc.getStorageTypes(),
          slimLoc.getStorageTypes());
      Assert.assertArrayEquals(origLoc.getTopologyPaths(),
          slimLoc.getTopologyPaths());
      Assert.assertArrayEquals(origLoc.getNames(), slimLoc.getNames());
      Assert.assertEquals(origLoc.getLength(), slimLoc.getLength());
      Assert.assertEquals(origLoc.getOffset(), slimLoc.getOffset());
      Assert.assertEquals(origLoc.isCorrupt(), slimLoc.isCorrupt());
    }
  }
  // Guard against the test silently passing on an empty listing.
  Assert.assertTrue(sawLocations);
}
public static List<Path> configureTestSimple(Configuration conf, FileSystem localFs) public static List<Path> configureTestSimple(Configuration conf, FileSystem localFs)
throws IOException { throws IOException {
Path base1 = new Path(TEST_ROOT_DIR, "input1"); Path base1 = new Path(TEST_ROOT_DIR, "input1");
@ -437,10 +487,31 @@ public FileStatus[] listStatus(Path f, PathFilter filter)
@Override @Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)
throws IOException { throws IOException {
return new BlockLocation[] { DatanodeInfo[] ds = new DatanodeInfo[2];
new BlockLocation(new String[] { "localhost:9866", "otherhost:9866" }, ds[0] = new DatanodeDescriptor(
new String[] { "localhost", "otherhost" }, new String[] { "localhost" }, new DatanodeID("127.0.0.1", "localhost", "abcd",
new String[0], 0, len, false) }; } 9866, 9867, 9868, 9869));
ds[1] = new DatanodeDescriptor(
new DatanodeID("1.0.0.1", "otherhost", "efgh",
9866, 9867, 9868, 9869));
long blockLen = len / 3;
ExtendedBlock b1 = new ExtendedBlock("bpid", 0, blockLen, 0);
ExtendedBlock b2 = new ExtendedBlock("bpid", 1, blockLen, 1);
ExtendedBlock b3 = new ExtendedBlock("bpid", 2, len - 2 * blockLen, 2);
String[] names = new String[]{ "localhost:9866", "otherhost:9866" };
String[] hosts = new String[]{ "localhost", "otherhost" };
String[] cachedHosts = {"localhost"};
BlockLocation loc1 = new BlockLocation(names, hosts, cachedHosts,
new String[0], 0, blockLen, false);
BlockLocation loc2 = new BlockLocation(names, hosts, cachedHosts,
new String[0], blockLen, blockLen, false);
BlockLocation loc3 = new BlockLocation(names, hosts, cachedHosts,
new String[0], 2 * blockLen, len - 2 * blockLen, false);
return new BlockLocation[]{
new HdfsBlockLocation(loc1, new LocatedBlock(b1, ds)),
new HdfsBlockLocation(loc2, new LocatedBlock(b2, ds)),
new HdfsBlockLocation(loc3, new LocatedBlock(b3, ds)) };
}
@Override @Override
protected RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f, protected RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f,