HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum error. Contributed by Andrew Wang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1456630 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2013-03-14 20:01:03 +00:00
parent bcabbcdf4c
commit 807e08334e
4 changed files with 64 additions and 2 deletions

View File

@ -417,6 +417,9 @@ Release 2.0.5-beta - UNRELEASED
HDFS-3277. fail over to loading a different FSImage if the first one we
try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
error. (Andrew Wang via atm)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -44,4 +44,5 @@ public boolean shouldCorruptAByte(File localfile) {
return false;
}
/**
 * Fault-injection hook; this default implementation does nothing.
 * NOTE(review): presumably invoked during a checkpoint once the fsimage
 * MD5 file has been written and renamed, but before the image file itself
 * is renamed, so that tests can simulate a failure at that point — confirm
 * against the caller.
 *
 * @throws IOException not thrown by this empty implementation; the
 *         declaration permits overriding or mocked implementations to
 *         throw it
 */
public void afterMD5Rename() throws IOException {}
}

View File

@ -1103,7 +1103,7 @@ void endCheckpoint(CheckpointSignature sig) throws IOException {
*/
public synchronized void saveDigestAndRenameCheckpointImage(
long txid, MD5Hash digest) throws IOException {
renameCheckpoint(txid);
// Write and rename MD5 file
List<StorageDirectory> badSds = Lists.newArrayList();
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
@ -1116,6 +1116,10 @@ public synchronized void saveDigestAndRenameCheckpointImage(
}
storage.reportErrorsOnDirectories(badSds);
CheckpointFaultInjector.getInstance().afterMD5Rename();
// Rename image from tmp file
renameCheckpoint(txid);
// So long as this is the newest image available,
// advertise it as such to other checkpointers
// from now on

View File

@ -231,7 +231,7 @@ public void testWriteTransactionIdHandlesIOE() throws Exception {
/*
* Simulate exception during edit replay.
*/
@Test(timeout=5000)
@Test(timeout=30000)
public void testReloadOnEditReplayFailure () throws IOException {
Configuration conf = new HdfsConfiguration();
FSDataOutputStream fos = null;
@ -1411,6 +1411,60 @@ public void testSecondaryImageDownload() throws IOException {
}
}
/**
 * Test NN restart if a failure happens in between creating the fsimage
 * MD5 file and renaming the fsimage.
 *
 * <p>The test runs one successful checkpoint, then arms the fault injector
 * so that the next checkpoint throws from {@code afterMD5Rename()}, and
 * finally restarts the NameNode, which should still come up successfully.
 *
 * @throws IOException if cluster setup, the file writes, the first
 *         checkpoint, the NameNode restart, or closing the file system
 *         fails
 */
@Test(timeout=30000)
public void testFailureBeforeRename () throws IOException {
  Configuration conf = new HdfsConfiguration();
  FSDataOutputStream fos = null;
  SecondaryNameNode secondary = null;
  MiniDFSCluster cluster = null;
  FileSystem fs = null;
  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
        .build();
    cluster.waitActive();
    fs = cluster.getFileSystem();
    secondary = startSecondaryNameNode(conf);

    // Write some data, take a clean checkpoint, then write more data so the
    // second checkpoint has new edits to process.
    fos = fs.create(new Path("tmpfile0"));
    fos.write(new byte[] { 0, 1, 2, 3 });
    secondary.doCheckpoint();
    fos.write(new byte[] { 0, 1, 2, 3 });
    fos.hsync();

    // Cause merge to fail in next checkpoint.
    Mockito.doThrow(new IOException(
        "Injecting failure after MD5Rename"))
        .when(faultInjector).afterMD5Rename();

    try {
      secondary.doCheckpoint();
      fail("Fault injection failed.");
    } catch (IOException expected) {
      // This is expected: the injected failure aborts the checkpoint.
    }
    Mockito.reset(faultInjector);

    // Namenode should still restart successfully
    cluster.restartNameNode();
  } finally {
    // Cleanup steps are chained through nested try/finally so that an
    // exception from one step cannot skip the later ones. In particular the
    // cluster is always shut down and the faultInjector mock (declared
    // outside this method) is always reset.
    try {
      if (secondary != null) {
        secondary.shutdown();
      }
    } finally {
      try {
        if (fs != null) {
          fs.close();
        }
      } finally {
        try {
          if (cluster != null) {
            cluster.shutdown();
          }
        } finally {
          Mockito.reset(faultInjector);
        }
      }
    }
  }
}
/**
* Test case where two secondary namenodes are checkpointing the same
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}