From f8904ad299bbcd109e3460f9b8ab9fbb9cebdad4 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Fri, 11 Apr 2014 03:48:29 +0000 Subject: [PATCH] HDFS-6231. DFSClient hangs infinitely if using hedged reads and all eligible datanodes die. Contributed by Chris Nauroth. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1586551 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 +++ .../apache/hadoop/hdfs/DFSInputStream.java | 20 ++++++++++--------- .../org/apache/hadoop/hdfs/TestPread.java | 6 +++++- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index b62f05fcc1..ecf918092f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -372,6 +372,9 @@ Release 2.4.1 - UNRELEASED HDFS-6208. DataNode caching can leak file descriptors. (cnauroth) + HDFS-6231. DFSClient hangs infinitely if using hedged reads and all eligible + datanodes die. (cnauroth) + Release 2.4.0 - 2014-04-07 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 756ef4b60d..301bd4c15a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -983,12 +983,15 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, return new Callable() { @Override public ByteBuffer call() throws Exception { - byte[] buf = bb.array(); - int offset = bb.position(); - actualGetFromOneDataNode(datanode, block, start, end, buf, offset, - corruptedBlockMap); - latch.countDown(); - return bb; + try { + byte[] buf = bb.array(); + int offset = bb.position(); + actualGetFromOneDataNode(datanode, block, start, end, buf, offset, + corruptedBlockMap); + return bb; + } finally { + latch.countDown(); + } } }; } @@ -1101,7 +1104,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, long end, byte[] buf, int offset, Map> corruptedBlockMap) throws IOException { - ArrayList> futures = null; + ArrayList> futures = new ArrayList>(); ArrayList ignored = new ArrayList(); ByteBuffer bb = null; int len = (int) (end - start + 1); @@ -1112,7 +1115,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, DNAddrPair chosenNode = null; Future future = null; // futures is null if there is no request already executing. - if (futures == null) { + if (futures.isEmpty()) { // chooseDataNode is a commitment. If no node, we go to // the NN to reget block locations. Only go here on first read. chosenNode = chooseDataNode(block, ignored); @@ -1130,7 +1133,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, // Ignore this node on next go around. ignored.add(chosenNode.info); dfsClient.getHedgedReadMetrics().incHedgedReadOps(); - futures = new ArrayList>(); futures.add(future); continue; // no need to refresh block locations } catch (InterruptedException e) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java index 80b8144b6b..65eaf67bcb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java @@ -237,7 +237,7 @@ public class TestPread { public void testHedgedPreadDFSBasic() throws IOException { Configuration conf = new Configuration(); conf.setInt(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THREADPOOL_SIZE, 5); - conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 100); + conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 1); dfsPreadTest(conf, false, true); // normal pread dfsPreadTest(conf, true, true); // trigger read code path without // transferTo. @@ -273,6 +273,10 @@ public class TestPread { DistributedFileSystem fileSys = cluster.getFileSystem(); DFSClient dfsClient = fileSys.getClient(); DFSHedgedReadMetrics metrics = dfsClient.getHedgedReadMetrics(); + // Metrics instance is static, so we need to reset counts from prior tests. + metrics.hedgedReadOps.set(0); + metrics.hedgedReadOpsWin.set(0); + metrics.hedgedReadOpsInCurThread.set(0); try { Path file1 = new Path("hedgedReadMaxOut.dat");