HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. Contributed by Bikas Saha.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1242564 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c9ca86c9c
commit
467059b4ab
@ -191,3 +191,4 @@ HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd)
|
|||||||
|
|
||||||
HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra)
|
HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra)
|
||||||
|
|
||||||
|
HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm)
|
||||||
|
@ -805,6 +805,14 @@ List<JournalAndStream> getJournals() {
|
|||||||
return journalSet.getAllJournalStreams();
|
return journalSet.getAllJournalStreams();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used only by tests.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
public JournalSet getJournalSet() {
|
||||||
|
return journalSet;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used only by unit tests.
|
* Used only by unit tests.
|
||||||
*/
|
*/
|
||||||
|
@ -25,8 +25,10 @@
|
|||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
|
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
|
||||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
@ -35,8 +37,6 @@
|
|||||||
import com.google.common.collect.Multimaps;
|
import com.google.common.collect.Multimaps;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Manages a collection of Journals. None of the methods are synchronized, it is
|
* Manages a collection of Journals. None of the methods are synchronized, it is
|
||||||
* assumed that FSEditLog methods, that use this class, use proper
|
* assumed that FSEditLog methods, that use this class, use proper
|
||||||
@ -148,11 +148,17 @@ public boolean isRequired() {
|
|||||||
|
|
||||||
private List<JournalAndStream> journals = Lists.newArrayList();
|
private List<JournalAndStream> journals = Lists.newArrayList();
|
||||||
final int minimumRedundantJournals;
|
final int minimumRedundantJournals;
|
||||||
|
private volatile Runtime runtime = Runtime.getRuntime();
|
||||||
|
|
||||||
JournalSet(int minimumRedundantResources) {
|
JournalSet(int minimumRedundantResources) {
|
||||||
this.minimumRedundantJournals = minimumRedundantResources;
|
this.minimumRedundantJournals = minimumRedundantResources;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public void setRuntimeForTesting(Runtime runtime) {
|
||||||
|
this.runtime = runtime;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public EditLogOutputStream startLogSegment(final long txId) throws IOException {
|
public EditLogOutputStream startLogSegment(final long txId) throws IOException {
|
||||||
mapJournalsAndReportErrors(new JournalClosure() {
|
mapJournalsAndReportErrors(new JournalClosure() {
|
||||||
@ -323,6 +329,12 @@ private void mapJournalsAndReportErrors(
|
|||||||
// continue on any of the other journals. Abort them to ensure that
|
// continue on any of the other journals. Abort them to ensure that
|
||||||
// retry behavior doesn't allow them to keep going in any way.
|
// retry behavior doesn't allow them to keep going in any way.
|
||||||
abortAllJournals();
|
abortAllJournals();
|
||||||
|
// The current policy is to shut down the NN on errors to shared edits
|
||||||
|
// dir. There are many code paths to shared edits failures - syncs,
|
||||||
|
// roll of edits etc. All of them go through this common function
|
||||||
|
// where the isRequired() check is made. Applying exit policy here
|
||||||
|
// to catch all code paths.
|
||||||
|
runtime.exit(1);
|
||||||
throw new IOException(msg);
|
throw new IOException(msg);
|
||||||
} else {
|
} else {
|
||||||
LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
|
LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
|
||||||
|
@ -40,6 +40,7 @@
|
|||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
|
|
||||||
@ -129,7 +130,6 @@ public void testFailureOfSharedDir() throws Exception {
|
|||||||
|
|
||||||
// The shared edits dir will automatically be marked required.
|
// The shared edits dir will automatically be marked required.
|
||||||
MiniDFSCluster cluster = null;
|
MiniDFSCluster cluster = null;
|
||||||
int chmodSucceeded = -1;
|
|
||||||
File sharedEditsDir = null;
|
File sharedEditsDir = null;
|
||||||
try {
|
try {
|
||||||
cluster = new MiniDFSCluster.Builder(conf)
|
cluster = new MiniDFSCluster.Builder(conf)
|
||||||
@ -145,16 +145,15 @@ public void testFailureOfSharedDir() throws Exception {
|
|||||||
assertTrue(fs.mkdirs(new Path("/test1")));
|
assertTrue(fs.mkdirs(new Path("/test1")));
|
||||||
|
|
||||||
// Blow away the shared edits dir.
|
// Blow away the shared edits dir.
|
||||||
|
Runtime mockRuntime = Mockito.mock(Runtime.class);
|
||||||
URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
|
URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
|
||||||
sharedEditsDir = new File(sharedEditsUri);
|
sharedEditsDir = new File(sharedEditsUri);
|
||||||
chmodSucceeded = FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
|
assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
|
||||||
true);
|
true));
|
||||||
if (chmodSucceeded != 0) {
|
|
||||||
LOG.error("Failed to remove write permissions on shared edits dir:"
|
|
||||||
+ sharedEditsDir.getAbsolutePath());
|
|
||||||
}
|
|
||||||
|
|
||||||
NameNode nn0 = cluster.getNameNode(0);
|
NameNode nn0 = cluster.getNameNode(0);
|
||||||
|
nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
|
||||||
|
.setRuntimeForTesting(mockRuntime);
|
||||||
try {
|
try {
|
||||||
// Make sure that subsequent operations on the NN fail.
|
// Make sure that subsequent operations on the NN fail.
|
||||||
nn0.getRpcServer().rollEditLog();
|
nn0.getRpcServer().rollEditLog();
|
||||||
@ -163,6 +162,12 @@ public void testFailureOfSharedDir() throws Exception {
|
|||||||
GenericTestUtils.assertExceptionContains(
|
GenericTestUtils.assertExceptionContains(
|
||||||
"Unable to start log segment 4: too few journals successfully started",
|
"Unable to start log segment 4: too few journals successfully started",
|
||||||
ioe);
|
ioe);
|
||||||
|
// By current policy the NN should exit upon this error.
|
||||||
|
// exit() should be called once, but since it is mocked, exit gets
|
||||||
|
// called once during FSEditLog.endCurrentLogSegment() and then after
|
||||||
|
// that during FSEditLog.startLogSegment(). So the check is atLeastOnce()
|
||||||
|
Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit(
|
||||||
|
Mockito.anyInt());
|
||||||
LOG.info("Got expected exception", ioe);
|
LOG.info("Got expected exception", ioe);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,7 +184,7 @@ public void testFailureOfSharedDir() throws Exception {
|
|||||||
NNStorage.getInProgressEditsFileName(1));
|
NNStorage.getInProgressEditsFileName(1));
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
if (chmodSucceeded == 0) {
|
if (sharedEditsDir != null) {
|
||||||
// without this test cleanup will fail
|
// without this test cleanup will fail
|
||||||
FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true);
|
FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user