diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 4c6cf2e363..1fb5c1c190 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -310,6 +310,9 @@ Release 2.4.0 - UNRELEASED HDFS-5843. DFSClient.getFileChecksum() throws IOException if checksum is disabled. (Laurent Goujon via jing9) + HDFS-5856. DataNode.checkDiskError might throw NPE. + (Josh Elser via suresh) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 0e0d79b28b..6bf98b6946 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -36,6 +36,7 @@ import java.net.URI; import java.net.UnknownHostException; import java.nio.channels.ClosedByInterruptException; +import java.nio.channels.ClosedChannelException; import java.nio.channels.SocketChannel; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; @@ -51,7 +52,6 @@ import javax.management.ObjectName; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -1324,12 +1324,7 @@ public void shutdown() { protected void checkDiskError(Exception e ) throws IOException { LOG.warn("checkDiskError: exception: ", e); - if (e instanceof SocketException || e instanceof SocketTimeoutException - || e instanceof ClosedByInterruptException - || e.getMessage().startsWith("An established connection was aborted") - || e.getMessage().startsWith("Broken pipe") - || e.getMessage().startsWith("Connection reset") - || e.getMessage().contains("java.nio.channels.SocketChannel")) { + if (isNetworkRelatedException(e)) { LOG.info("Not checking disk as checkDiskError was called on a network" + " related exception"); return; @@ -1342,6 +1337,28 @@ protected void checkDiskError(Exception e ) throws IOException { } } + /** + * Check if the provided exception looks like it's from a network error + * @param e the exception from a checkDiskError call + * @return true if this exception is network related, false otherwise + */ + protected boolean isNetworkRelatedException(Exception e) { + if (e instanceof SocketException + || e instanceof SocketTimeoutException + || e instanceof ClosedChannelException + || e instanceof ClosedByInterruptException) { + return true; + } + + String msg = e.getMessage(); + + return null != msg + && (msg.startsWith("An established connection was aborted") + || msg.startsWith("Broken pipe") + || msg.startsWith("Connection reset") + || msg.contains("java.nio.channels.SocketChannel")); + } + /** * Check if there is a disk failure and if so, handle the error */ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java index 6b70cbfc59..e36005bafc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java @@ -18,12 +18,16 @@ package org.apache.hadoop.hdfs.server.datanode; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.DataOutputStream; import java.io.File; import java.net.InetSocketAddress; import java.net.Socket; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.nio.channels.ClosedChannelException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -196,4 +200,16 @@ public void testLocalDirs() throws Exception { } } } + + @Test + public void testNetworkErrorsIgnored() { + DataNode dn = cluster.getDataNodes().iterator().next(); + + assertTrue(dn.isNetworkRelatedException(new SocketException())); + assertTrue(dn.isNetworkRelatedException(new SocketTimeoutException())); + assertTrue(dn.isNetworkRelatedException(new ClosedChannelException())); + assertTrue(dn.isNetworkRelatedException(new Exception("Broken pipe foo bar"))); + assertFalse(dn.isNetworkRelatedException(new Exception())); + assertFalse(dn.isNetworkRelatedException(new Exception("random problem"))); + } }