From ebe1d1fbf7a7549078e5a468080513db09b6416f Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Fri, 13 Nov 2020 14:22:35 -0600 Subject: [PATCH] HADOOP-17362. reduce RPC calls doing ls on HAR file (#2444). Contributed by Daryn Sharp and Ahmed Hussein --- .../org/apache/hadoop/fs/HarFileSystem.java | 70 +++++++++---------- .../apache/hadoop/fs/TestHarFileSystem.java | 1 - .../hadoop/fs/TestHarFileSystemBasics.java | 5 +- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java index 02c0916e54..7e12d0a11e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java @@ -35,6 +35,7 @@ import java.net.URISyntaxException; import java.net.URLDecoder; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; @@ -513,41 +514,22 @@ private void fileStatusesInIndex(HarStatus parent, List statuses) if (!parentString.endsWith(Path.SEPARATOR)){ parentString += Path.SEPARATOR; } - Path harPath = new Path(parentString); - int harlen = harPath.depth(); - final Map cache = new TreeMap(); - for (HarStatus hstatus : metadata.archive.values()) { - String child = hstatus.getName(); - if ((child.startsWith(parentString))) { - Path thisPath = new Path(child); - if (thisPath.depth() == harlen + 1) { - statuses.add(toFileStatus(hstatus, cache)); - } - } + for (String child: parent.children) { + Path p = new Path(parentString + child); + statuses.add(toFileStatus(metadata.archive.get(p))); } } /** * Combine the status stored in the index and the underlying status. * @param h status stored in the index - * @param cache caching the underlying file statuses * @return the combined file status * @throws IOException */ - private FileStatus toFileStatus(HarStatus h, - Map cache) throws IOException { - FileStatus underlying = null; - if (cache != null) { - underlying = cache.get(h.partName); - } - if (underlying == null) { - final Path p = h.isDir? archivePath: new Path(archivePath, h.partName); - underlying = fs.getFileStatus(p); - if (cache != null) { - cache.put(h.partName, underlying); - } - } + private FileStatus toFileStatus(HarStatus h) throws IOException { + final Path p = h.isDir ? archivePath : new Path(archivePath, h.partName); + FileStatus underlying = metadata.getPartFileStatus(p); long modTime = 0; int version = metadata.getVersion(); @@ -658,7 +640,7 @@ public long getModificationTime() { @Override public FileStatus getFileStatus(Path f) throws IOException { HarStatus hstatus = getFileHarStatus(f); - return toFileStatus(hstatus, null); + return toFileStatus(hstatus); } private HarStatus getFileHarStatus(Path f) throws IOException { @@ -815,7 +797,7 @@ public FileStatus[] listStatus(Path f) throws IOException { if (hstatus.isDir()) { fileStatusesInIndex(hstatus, statuses); } else { - statuses.add(toFileStatus(hstatus, null)); + statuses.add(toFileStatus(hstatus)); } return statuses.toArray(new FileStatus[statuses.size()]); @@ -1143,7 +1125,8 @@ private class HarMetaData { List stores = new ArrayList(); Map archive = new HashMap(); - private Map partFileStatuses = new HashMap(); + // keys are always the internal har path. + private Map partFileStatuses = new ConcurrentHashMap<>(); public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) { this.fs = fs; @@ -1151,16 +1134,23 @@ public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) { this.archiveIndexPath = archiveIndexPath; } - public FileStatus getPartFileStatus(Path partPath) throws IOException { + public FileStatus getPartFileStatus(Path path) throws IOException { + Path partPath = getPathInHar(path); FileStatus status; status = partFileStatuses.get(partPath); if (status == null) { - status = fs.getFileStatus(partPath); + status = fs.getFileStatus(path); partFileStatuses.put(partPath, status); } return status; } + private void addPartFileStatuses(Path path) throws IOException { + for (FileStatus stat : fs.listStatus(path)) { + partFileStatuses.put(getPathInHar(stat.getPath()), stat); + } + } + public long getMasterIndexTimestamp() { return masterIndexTimestamp; } @@ -1217,16 +1207,22 @@ private void parseMetaData() throws IOException { try { FileStatus archiveStat = fs.getFileStatus(archiveIndexPath); archiveIndexTimestamp = archiveStat.getModificationTime(); - LineReader aLin; + + // pre-populate part cache. + addPartFileStatuses(archiveIndexPath.getParent()); + LineReader aLin = null; // now start reading the real index file + long pos = -1; for (Store s: stores) { - read = 0; - aIn.seek(s.begin); - aLin = new LineReader(aIn, getConf()); - while (read + s.begin < s.end) { - int tmp = aLin.readLine(line); - read += tmp; + if (pos != s.begin) { + pos = s.begin; + aIn.seek(s.begin); + aLin = new LineReader(aIn, getConf()); + } + + while (pos < s.end) { + pos += aLin.readLine(line); String lineFeed = line.toString(); String[] parsed = lineFeed.split(" "); parsed[0] = decodeFileName(parsed[0]); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java index 8050ce6b44..711ab94fdf 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java @@ -41,7 +41,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.CompletableFuture; import static org.apache.hadoop.fs.Options.ChecksumOpt; diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java index c58e731b82..6415df6310 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystemBasics.java @@ -33,7 +33,10 @@ import java.util.HashSet; import java.util.Set; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + /** * This test class checks basic operations with {@link HarFileSystem} including