HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. Contributed by Bikas Saha.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1242564 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c9ca86c9c
commit
467059b4ab
@ -191,3 +191,4 @@ HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd)
|
||||
|
||||
HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra)
|
||||
|
||||
HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm)
|
||||
|
@ -805,6 +805,14 @@ List<JournalAndStream> getJournals() {
|
||||
return journalSet.getAllJournalStreams();
|
||||
}
|
||||
|
||||
/**
|
||||
* Used only by tests.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
public JournalSet getJournalSet() {
|
||||
return journalSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used only by unit tests.
|
||||
*/
|
||||
|
@ -25,8 +25,10 @@
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
@ -35,8 +37,6 @@
|
||||
import com.google.common.collect.Multimaps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
|
||||
/**
|
||||
* Manages a collection of Journals. None of the methods are synchronized, it is
|
||||
* assumed that FSEditLog methods, that use this class, use proper
|
||||
@ -148,11 +148,17 @@ public boolean isRequired() {
|
||||
|
||||
private List<JournalAndStream> journals = Lists.newArrayList();
|
||||
final int minimumRedundantJournals;
|
||||
private volatile Runtime runtime = Runtime.getRuntime();
|
||||
|
||||
JournalSet(int minimumRedundantResources) {
|
||||
this.minimumRedundantJournals = minimumRedundantResources;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public void setRuntimeForTesting(Runtime runtime) {
|
||||
this.runtime = runtime;
|
||||
}
|
||||
|
||||
@Override
|
||||
public EditLogOutputStream startLogSegment(final long txId) throws IOException {
|
||||
mapJournalsAndReportErrors(new JournalClosure() {
|
||||
@ -323,6 +329,12 @@ private void mapJournalsAndReportErrors(
|
||||
// continue on any of the other journals. Abort them to ensure that
|
||||
// retry behavior doesn't allow them to keep going in any way.
|
||||
abortAllJournals();
|
||||
// the current policy is to shutdown the NN on errors to shared edits
|
||||
// dir. There are many code paths to shared edits failures - syncs,
|
||||
// roll of edits etc. All of them go through this common function
|
||||
// where the isRequired() check is made. Applying exit policy here
|
||||
// to catch all code paths.
|
||||
runtime.exit(1);
|
||||
throw new IOException(msg);
|
||||
} else {
|
||||
LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
|
||||
|
@ -40,6 +40,7 @@
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
@ -129,7 +130,6 @@ public void testFailureOfSharedDir() throws Exception {
|
||||
|
||||
// The shared edits dir will automatically be marked required.
|
||||
MiniDFSCluster cluster = null;
|
||||
int chmodSucceeded = -1;
|
||||
File sharedEditsDir = null;
|
||||
try {
|
||||
cluster = new MiniDFSCluster.Builder(conf)
|
||||
@ -145,16 +145,15 @@ public void testFailureOfSharedDir() throws Exception {
|
||||
assertTrue(fs.mkdirs(new Path("/test1")));
|
||||
|
||||
// Blow away the shared edits dir.
|
||||
Runtime mockRuntime = Mockito.mock(Runtime.class);
|
||||
URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
|
||||
sharedEditsDir = new File(sharedEditsUri);
|
||||
chmodSucceeded = FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
|
||||
true);
|
||||
if (chmodSucceeded != 0) {
|
||||
LOG.error("Failed to remove write permissions on shared edits dir:"
|
||||
+ sharedEditsDir.getAbsolutePath());
|
||||
}
|
||||
assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
|
||||
true));
|
||||
|
||||
NameNode nn0 = cluster.getNameNode(0);
|
||||
nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
|
||||
.setRuntimeForTesting(mockRuntime);
|
||||
try {
|
||||
// Make sure that subsequent operations on the NN fail.
|
||||
nn0.getRpcServer().rollEditLog();
|
||||
@ -163,6 +162,12 @@ public void testFailureOfSharedDir() throws Exception {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Unable to start log segment 4: too few journals successfully started",
|
||||
ioe);
|
||||
// By current policy the NN should exit upon this error.
|
||||
// exit() should be called once, but since it is mocked, exit gets
|
||||
// called once during FSEditsLog.endCurrentLogSegment() and then after
|
||||
// that during FSEditsLog.startLogSegment(). So the check is atLeast(1)
|
||||
Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit(
|
||||
Mockito.anyInt());
|
||||
LOG.info("Got expected exception", ioe);
|
||||
}
|
||||
|
||||
@ -179,7 +184,7 @@ public void testFailureOfSharedDir() throws Exception {
|
||||
NNStorage.getInProgressEditsFileName(1));
|
||||
}
|
||||
} finally {
|
||||
if (chmodSucceeded == 0) {
|
||||
if (sharedEditsDir != null) {
|
||||
// without this test cleanup will fail
|
||||
FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user