HDFS-5159. Secondary NameNode fails to checkpoint if error occurs downloading edits on first checkpoint. Contributed by Aaron T. Myers.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1520363 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2013-09-05 16:21:26 +00:00
parent e8f6f74025
commit 8aea748ec3
3 changed files with 60 additions and 8 deletions

View File

@ -415,6 +415,9 @@ Release 2.1.1-beta - UNRELEASED
HDFS-5140. Too many safemode monitor threads being created in the standby HDFS-5140. Too many safemode monitor threads being created in the standby
namenode causing it to fail with out of memory error. (jing9) namenode causing it to fail with out of memory error. (jing9)
HDFS-5159. Secondary NameNode fails to checkpoint if error occurs
downloading edits on first checkpoint. (atm)
Release 2.1.0-beta - 2013-08-22 Release 2.1.0-beta - 2013-08-22
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -429,10 +429,8 @@ public Boolean run() throws Exception {
dstImage.getStorage().cTime = sig.cTime; dstImage.getStorage().cTime = sig.cTime;
// get fsimage // get fsimage
boolean downloadImage = true;
if (sig.mostRecentCheckpointTxId == if (sig.mostRecentCheckpointTxId ==
dstImage.getStorage().getMostRecentCheckpointTxId()) { dstImage.getStorage().getMostRecentCheckpointTxId()) {
downloadImage = false;
LOG.info("Image has not changed. Will not download image."); LOG.info("Image has not changed. Will not download image.");
} else { } else {
LOG.info("Image has changed. Downloading updated image from NN."); LOG.info("Image has changed. Downloading updated image from NN.");
@ -448,7 +446,9 @@ public Boolean run() throws Exception {
nnHostPort, log, dstImage.getStorage()); nnHostPort, log, dstImage.getStorage());
} }
return Boolean.valueOf(downloadImage); // true if we haven't loaded all the transactions represented by the
// downloaded fsimage.
return dstImage.getLastAppliedTxId() < sig.mostRecentCheckpointTxId;
} }
}); });
return b.booleanValue(); return b.booleanValue();

View File

@ -39,7 +39,6 @@
import java.util.List; import java.util.List;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger; import org.apache.commons.logging.impl.Log4JLogger;
@ -1224,7 +1223,6 @@ public void testSaveNamespace() throws IOException {
} }
/* Test case to test CheckpointSignature */ /* Test case to test CheckpointSignature */
@SuppressWarnings("deprecation")
@Test @Test
public void testCheckpointSignature() throws IOException { public void testCheckpointSignature() throws IOException {
@ -1563,11 +1561,64 @@ public void testEditFailureBeforeRename() throws IOException {
} }
} }
/**
 * Test that a fault while downloading edits the first time after the 2NN
 * starts up does not prevent future checkpointing.
 *
 * The test saves the namespace on the NN first, so that the NN's on-disk
 * fsimage has a txid greater than zero, then starts a fresh 2NN, injects a
 * failure before the edits rename on the first checkpoint, and finally
 * verifies that a second checkpoint attempt goes through.
 *
 * @throws IOException if an unexpected I/O failure occurs while running the
 *         test or while closing the file system during cleanup
 */
@Test(timeout = 30000)
public void testEditFailureOnFirstCheckpoint() throws IOException {
  Configuration conf = new HdfsConfiguration();
  SecondaryNameNode secondary = null;
  MiniDFSCluster cluster = null;
  FileSystem fs = null;
  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
        .build();
    cluster.waitActive();
    fs = cluster.getFileSystem();
    fs.mkdirs(new Path("test-file-1"));

    // Make sure the on-disk fsimage on the NN has txid > 0.
    FSNamesystem fsns = cluster.getNamesystem();
    fsns.enterSafeMode(false);
    fsns.saveNamespace();
    fsns.leaveSafeMode();

    secondary = startSecondaryNameNode(conf);

    // Cause edit rename to fail during next checkpoint
    Mockito.doThrow(new IOException("Injecting failure before edit rename"))
        .when(faultInjector).beforeEditsRename();

    try {
      secondary.doCheckpoint();
      fail("Fault injection failed.");
    } catch (IOException ioe) {
      // Only the injected failure is acceptable here.
      GenericTestUtils.assertExceptionContains(
          "Injecting failure before edit rename", ioe);
    }
    // Remove the injected failure so the retry runs against a clean injector.
    Mockito.reset(faultInjector);

    // Next checkpoint should succeed
    secondary.doCheckpoint();
  } finally {
    // Cleanup steps are nested in try/finally so that a failure in one step
    // (for example fs.close() throwing) can neither leak the mini cluster
    // nor leave the fault injector stubbed for whatever test runs next.
    // The order of the steps is the same as before.
    try {
      if (secondary != null) {
        secondary.shutdown();
      }
    } finally {
      try {
        if (fs != null) {
          fs.close();
        }
      } finally {
        try {
          if (cluster != null) {
            cluster.shutdown();
          }
        } finally {
          Mockito.reset(faultInjector);
        }
      }
    }
  }
}
/** /**
* Test that the secondary namenode correctly deletes temporary edits * Test that the secondary namenode correctly deletes temporary edits
* on startup. * on startup.
*/ */
@Test(timeout = 30000) @Test(timeout = 30000)
public void testDeleteTemporaryEditsOnStartup() throws IOException { public void testDeleteTemporaryEditsOnStartup() throws IOException {
Configuration conf = new HdfsConfiguration(); Configuration conf = new HdfsConfiguration();
@ -1943,7 +1994,6 @@ public void testNamespaceVerifiedOnFileTransfer() throws IOException {
* Test that, if a storage directory is failed when a checkpoint occurs, * Test that, if a storage directory is failed when a checkpoint occurs,
* the non-failed storage directory receives the checkpoint. * the non-failed storage directory receives the checkpoint.
*/ */
@SuppressWarnings("deprecation")
@Test @Test
public void testCheckpointWithFailedStorageDir() throws Exception { public void testCheckpointWithFailedStorageDir() throws Exception {
MiniDFSCluster cluster = null; MiniDFSCluster cluster = null;
@ -2006,7 +2056,6 @@ public void testCheckpointWithFailedStorageDir() throws Exception {
* should function correctly. * should function correctly.
* @throws Exception * @throws Exception
*/ */
@SuppressWarnings("deprecation")
@Test @Test
public void testCheckpointWithSeparateDirsAfterNameFails() throws Exception { public void testCheckpointWithSeparateDirsAfterNameFails() throws Exception {
MiniDFSCluster cluster = null; MiniDFSCluster cluster = null;