HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. Contributed by Bikas Saha.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1242564 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2012-02-09 22:23:47 +00:00
parent 2c9ca86c9c
commit 467059b4ab
4 changed files with 36 additions and 10 deletions

View File

@ -191,3 +191,4 @@ HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd)
HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra) HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra)
HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm)

View File

@ -805,6 +805,14 @@ List<JournalAndStream> getJournals() {
return journalSet.getAllJournalStreams(); return journalSet.getAllJournalStreams();
} }
/**
 * Exposes the {@link JournalSet} held by this edit log.
 * Intended for use by tests only.
 *
 * @return the journal set this edit log writes through
 */
@VisibleForTesting
public JournalSet getJournalSet() {
  return this.journalSet;
}
/** /**
* Used only by unit tests. * Used only by unit tests.
*/ */

View File

@ -25,8 +25,10 @@
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
@ -35,8 +37,6 @@
import com.google.common.collect.Multimaps; import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import org.apache.hadoop.classification.InterfaceAudience;
/** /**
* Manages a collection of Journals. None of the methods are synchronized, it is * Manages a collection of Journals. None of the methods are synchronized, it is
* assumed that FSEditLog methods, that use this class, use proper * assumed that FSEditLog methods, that use this class, use proper
@ -148,11 +148,17 @@ public boolean isRequired() {
private List<JournalAndStream> journals = Lists.newArrayList(); private List<JournalAndStream> journals = Lists.newArrayList();
final int minimumRedundantJournals; final int minimumRedundantJournals;
private volatile Runtime runtime = Runtime.getRuntime();
JournalSet(int minimumRedundantResources) { JournalSet(int minimumRedundantResources) {
this.minimumRedundantJournals = minimumRedundantResources; this.minimumRedundantJournals = minimumRedundantResources;
} }
/**
 * Swaps the {@link Runtime} instance that this journal set calls
 * {@code exit} on, so that a test can supply a mock instead of the
 * real runtime obtained from {@link Runtime#getRuntime()}.
 *
 * @param runtime the runtime to use from now on
 */
@VisibleForTesting
public void setRuntimeForTesting(final Runtime runtime) {
  this.runtime = runtime;
}
@Override @Override
public EditLogOutputStream startLogSegment(final long txId) throws IOException { public EditLogOutputStream startLogSegment(final long txId) throws IOException {
mapJournalsAndReportErrors(new JournalClosure() { mapJournalsAndReportErrors(new JournalClosure() {
@ -323,6 +329,12 @@ private void mapJournalsAndReportErrors(
// continue on any of the other journals. Abort them to ensure that // continue on any of the other journals. Abort them to ensure that
// retry behavior doesn't allow them to keep going in any way. // retry behavior doesn't allow them to keep going in any way.
abortAllJournals(); abortAllJournals();
// The current policy is to shut down the NN on errors to the shared
// edits dir. There are many code paths that can hit shared edits
// failures - syncs, rolling of edits, etc. All of them go through this
// common function, where the isRequired() check is made, so the exit
// policy is applied here to catch all code paths.
runtime.exit(1);
throw new IOException(msg); throw new IOException(msg);
} else { } else {
LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);

View File

@ -40,6 +40,7 @@
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
@ -129,7 +130,6 @@ public void testFailureOfSharedDir() throws Exception {
// The shared edits dir will automatically be marked required. // The shared edits dir will automatically be marked required.
MiniDFSCluster cluster = null; MiniDFSCluster cluster = null;
int chmodSucceeded = -1;
File sharedEditsDir = null; File sharedEditsDir = null;
try { try {
cluster = new MiniDFSCluster.Builder(conf) cluster = new MiniDFSCluster.Builder(conf)
@ -145,16 +145,15 @@ public void testFailureOfSharedDir() throws Exception {
assertTrue(fs.mkdirs(new Path("/test1"))); assertTrue(fs.mkdirs(new Path("/test1")));
// Blow away the shared edits dir. // Blow away the shared edits dir.
Runtime mockRuntime = Mockito.mock(Runtime.class);
URI sharedEditsUri = cluster.getSharedEditsDir(0, 1); URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
sharedEditsDir = new File(sharedEditsUri); sharedEditsDir = new File(sharedEditsUri);
chmodSucceeded = FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
true); true));
if (chmodSucceeded != 0) {
LOG.error("Failed to remove write permissions on shared edits dir:"
+ sharedEditsDir.getAbsolutePath());
}
NameNode nn0 = cluster.getNameNode(0); NameNode nn0 = cluster.getNameNode(0);
nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
.setRuntimeForTesting(mockRuntime);
try { try {
// Make sure that subsequent operations on the NN fail. // Make sure that subsequent operations on the NN fail.
nn0.getRpcServer().rollEditLog(); nn0.getRpcServer().rollEditLog();
@ -163,6 +162,12 @@ public void testFailureOfSharedDir() throws Exception {
GenericTestUtils.assertExceptionContains( GenericTestUtils.assertExceptionContains(
"Unable to start log segment 4: too few journals successfully started", "Unable to start log segment 4: too few journals successfully started",
ioe); ioe);
// By current policy the NN should exit upon this error.
// exit() should be called once, but since it is mocked the NN keeps
// running, so exit() gets called once during
// FSEditLog.endCurrentLogSegment() and then again during
// FSEditLog.startLogSegment(). So the check is atLeastOnce().
Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit(
Mockito.anyInt());
LOG.info("Got expected exception", ioe); LOG.info("Got expected exception", ioe);
} }
@ -179,7 +184,7 @@ public void testFailureOfSharedDir() throws Exception {
NNStorage.getInProgressEditsFileName(1)); NNStorage.getInProgressEditsFileName(1));
} }
} finally { } finally {
if (chmodSucceeded == 0) { if (sharedEditsDir != null) {
// without this test cleanup will fail // without this test cleanup will fail
FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true); FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true);
} }