HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..). Contributed by mahadev

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@915168 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tsz-wo Sze 2010-02-23 03:54:14 +00:00
parent c5622e5d4d
commit 4eedc77275
2 changed files with 68 additions and 23 deletions

View File

@ -163,6 +163,9 @@ Trunk (unreleased changes)
OPTIMIZATIONS
HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).
(mahadev via szetszwo)
BUG FIXES
HADOOP-6293. Fix FsShell -text to work on filesystems other than the

View File

@ -325,25 +325,12 @@ public Path makeQualified(Path path) {
/**
 * Returns block locations for a file that lives inside this har filesystem.
 *
 * NOTE(review): this listing looks like a rendered diff whose +/- markers
 * were stripped. Two alternative bodies appear back to back: one that looks
 * the file up in the archive index and maps the part file's block locations,
 * and one that simply returns a single placeholder location. Everything after
 * the first {@code return} would be unreachable, so presumably only one of
 * the two bodies is live in the real file -- confirm against the repository.
 *
 * @param file the file status whose path is resolved against this filesystem
 * @param start offset within the file; the index-lookup variant adds it to
 *          the file's start index inside the part file, the placeholder
 *          variant ignores it
 * @param len number of bytes of interest; passed through by the index-lookup
 *          variant, ignored by the placeholder variant
 * @return an empty array for a directory or the remapped part-file locations
 *         (index-lookup variant); a one-element array holding a default
 *         {@code BlockLocation} (placeholder variant)
 * @throws IOException a FileNotFoundException if the path has no entry in the
 *           index, or any error raised by the underlying filesystem
 */
@Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
long len) throws IOException {
// --- index-lookup variant ---
// need to look up the file in the underlying fs
// look up the index
// make sure this is a part of this har filesystem
Path p = makeQualified(file.getPath());
Path harPath = getPathInHar(p);
String line = fileStatusInIndex(harPath);
if (line == null) {
throw new FileNotFoundException("File " + file.getPath() + " not found");
}
HarStatus harStatus = new HarStatus(line);
if (harStatus.isDir()) {
// directories have no blocks
return new BlockLocation[0];
}
// the archived file's bytes live in a part file under the archive path
FileStatus fsFile = fs.getFileStatus(new Path(archivePath,
harStatus.getPartName()));
// shift the requested range by the file's start index within the part file
BlockLocation[] rawBlocks = fs.getFileBlockLocations(fsFile,
harStatus.getStartIndex() + start, len);
return fakeBlockLocations(rawBlocks, harStatus.getStartIndex());
// --- placeholder variant ---
// Just fake the block locations: it is faster and simpler.
// Doing the various block location manipulations against
// the part files adds a lot of overhead because of the
// file status lookups in the index files.
return new BlockLocation[]{ new BlockLocation() };
}
/**
@ -387,6 +374,63 @@ public Store(long begin, long end, int startHash, int endHash) {
public int endHash;
}
/**
 * Collects the file statuses of all direct children of a directory by
 * scanning the archive index from start to end, one line at a time. This is
 * a brute force pass over the whole index, done once per listing instead of
 * once per child.
 *
 * An index entry counts as a child only when its parent path equals the
 * directory's path. A plain string-prefix plus depth test is not enough:
 * for a directory {@code /a} it would also accept {@code /ab/c}.
 *
 * @param parent
 *          the har status of the directory whose children are wanted
 * @param statuses
 *          the list the children's file statuses are appended to
 * @param children
 *          the string list of children for this parent; currently unused,
 *          the children are discovered from the index itself
 * @param archiveIndexStat
 *          the archive index file status; supplies the index length and the
 *          replication, block size, times, permission, owner and group
 *          reported for every child
 * @throws IOException
 *           if the index file cannot be opened or read
 */
private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
    List<String> children, FileStatus archiveIndexStat) throws IOException {
  FSDataInputStream aIn = null;
  try {
    aIn = fs.open(archiveIndex);
    LineReader aLin = new LineReader(aIn, getConf());
    String parentString = parent.getName();
    Path harPath = new Path(parentString);
    long indexLen = archiveIndexStat.getLen();
    Text line = new Text();
    long read = 0;
    while (read < indexLen) {
      int consumed = aLin.readLine(line);
      if (consumed <= 0) {
        // The index ended before its reported length; nothing more to read.
        break;
      }
      read += consumed;
      String lineFeed = line.toString();
      int sep = lineFeed.indexOf(' ');
      if (sep < 0) {
        // Blank or malformed line without a "name " prefix: not an entry.
        continue;
      }
      String child = lineFeed.substring(0, sep);
      // Cheap string filter first so a Path is only built for candidates.
      if (!child.startsWith(parentString)) {
        continue;
      }
      // Direct child only: the entry's parent must be exactly this directory.
      if (!harPath.equals(new Path(child).getParent())) {
        continue;
      }
      HarStatus hstatus = new HarStatus(lineFeed);
      boolean isDir = hstatus.isDir();
      // Everything except length, type and path is inherited from the
      // archive index file status.
      FileStatus childStatus = new FileStatus(
          isDir ? 0 : hstatus.getLength(),
          isDir,
          (int) archiveIndexStat.getReplication(),
          archiveIndexStat.getBlockSize(),
          archiveIndexStat.getModificationTime(),
          archiveIndexStat.getAccessTime(),
          new FsPermission(archiveIndexStat.getPermission()),
          archiveIndexStat.getOwner(),
          archiveIndexStat.getGroup(),
          makeRelative(this.uri.toString(), new Path(hstatus.name)));
      statuses.add(childStatus);
    }
  } finally {
    if (aIn != null) {
      aIn.close();
    }
  }
}
// make sure that this harPath is relative to the har filesystem
// this only works for relative paths. This returns the line matching
// the file in the index. Returns null if there is no matching
@ -650,10 +694,8 @@ public FileStatus[] listStatus(Path f) throws IOException {
archiveStatus.getOwner(), archiveStatus.getGroup(),
makeRelative(this.uri.toString(), new Path(hstatus.name))));
else
for (String child: hstatus.children) {
FileStatus tmp = getFileStatus(new Path(tmpPath, child));
statuses.add(tmp);
}
fileStatusesInIndex(hstatus, statuses, hstatus.children, archiveStatus);
return statuses.toArray(new FileStatus[statuses.size()]);
}