From 467059b4ab5fcb8251b57c60ec3ddfce30c486c2 Mon Sep 17 00:00:00 2001 From: Aaron Myers Date: Thu, 9 Feb 2012 22:23:47 +0000 Subject: [PATCH] HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. Contributed by Bikas Saha. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1242564 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-hdfs/CHANGES.HDFS-1623.txt | 1 + .../hdfs/server/namenode/FSEditLog.java | 8 +++++++ .../hdfs/server/namenode/JournalSet.java | 16 ++++++++++++-- .../namenode/ha/TestFailureOfSharedDir.java | 21 ++++++++++++------- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt index eac0563d7f..07d61b6083 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt @@ -191,3 +191,4 @@ HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd) HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra) +HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java index d9a64589ce..2c72a7d5f2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java @@ -805,6 +805,14 @@ List getJournals() { return journalSet.getAllJournalStreams(); } + /** + * Used only by tests. + */ + @VisibleForTesting + public JournalSet getJournalSet() { + return journalSet; + } + /** * Used only by unit tests. */ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java index 8fc323c31d..d84d79dcb5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java @@ -25,8 +25,10 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; + import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; @@ -35,8 +37,6 @@ import com.google.common.collect.Multimaps; import com.google.common.collect.Sets; -import org.apache.hadoop.classification.InterfaceAudience; - /** * Manages a collection of Journals. None of the methods are synchronized, it is * assumed that FSEditLog methods, that use this class, use proper @@ -148,11 +148,17 @@ public boolean isRequired() { private List journals = Lists.newArrayList(); final int minimumRedundantJournals; + private volatile Runtime runtime = Runtime.getRuntime(); JournalSet(int minimumRedundantResources) { this.minimumRedundantJournals = minimumRedundantResources; } + @VisibleForTesting + public void setRuntimeForTesting(Runtime runtime) { + this.runtime = runtime; + } + @Override public EditLogOutputStream startLogSegment(final long txId) throws IOException { mapJournalsAndReportErrors(new JournalClosure() { @@ -323,6 +329,12 @@ private void mapJournalsAndReportErrors( // continue on any of the other journals. Abort them to ensure that // retry behavior doesn't allow them to keep going in any way. abortAllJournals(); + // the current policy is to shutdown the NN on errors to shared edits + // dir. There are many code paths to shared edits failures - syncs, + // roll of edits etc. All of them go through this common function + // where the isRequired() check is made. Applying exit policy here + // to catch all code paths. + runtime.exit(1); throw new IOException(msg); } else { LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java index 71098d8aaa..cc9552aec2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.test.GenericTestUtils; import org.junit.Test; +import org.mockito.Mockito; import com.google.common.base.Joiner; @@ -129,7 +130,6 @@ public void testFailureOfSharedDir() throws Exception { // The shared edits dir will automatically be marked required. MiniDFSCluster cluster = null; - int chmodSucceeded = -1; File sharedEditsDir = null; try { cluster = new MiniDFSCluster.Builder(conf) @@ -145,16 +145,15 @@ public void testFailureOfSharedDir() throws Exception { assertTrue(fs.mkdirs(new Path("/test1"))); // Blow away the shared edits dir. + Runtime mockRuntime = Mockito.mock(Runtime.class); URI sharedEditsUri = cluster.getSharedEditsDir(0, 1); sharedEditsDir = new File(sharedEditsUri); - chmodSucceeded = FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", - true); - if (chmodSucceeded != 0) { - LOG.error("Failed to remove write permissions on shared edits dir:" - + sharedEditsDir.getAbsolutePath()); - } + assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", + true)); NameNode nn0 = cluster.getNameNode(0); + nn0.getNamesystem().getFSImage().getEditLog().getJournalSet() + .setRuntimeForTesting(mockRuntime); try { // Make sure that subsequent operations on the NN fail. nn0.getRpcServer().rollEditLog(); @@ -163,6 +162,12 @@ public void testFailureOfSharedDir() throws Exception { GenericTestUtils.assertExceptionContains( "Unable to start log segment 4: too few journals successfully started", ioe); + // By current policy the NN should exit upon this error. + // exit() should be called once, but since it is mocked, exit gets + // called once during FSEditsLog.endCurrentLogSegment() and then after + // that during FSEditsLog.startLogSegment(). So the check is atLeast(1) + Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit( + Mockito.anyInt()); LOG.info("Got expected exception", ioe); } @@ -179,7 +184,7 @@ public void testFailureOfSharedDir() throws Exception { NNStorage.getInProgressEditsFileName(1)); } } finally { - if (chmodSucceeded == 0) { + if (sharedEditsDir != null) { // without this test cleanup will fail FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true); }