HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum error. Contributed by Andrew Wang.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1456630 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bcabbcdf4c
commit
807e08334e
@ -417,6 +417,9 @@ Release 2.0.5-beta - UNRELEASED
|
||||
HDFS-3277. fail over to loading a different FSImage if the first one we
|
||||
try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
|
||||
|
||||
HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
|
||||
error. (Andrew Wang via atm)
|
||||
|
||||
Release 2.0.4-alpha - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -44,4 +44,5 @@ public boolean shouldCorruptAByte(File localfile) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Fault-injection hook for the window between writing/renaming the
 * checkpoint image's MD5 file and renaming the image file itself.
 * The default implementation does nothing; tests can stub this method
 * to throw in order to simulate a failure at that point.
 *
 * @throws IOException never thrown by this default implementation;
 *         declared so that a stubbed or overriding version may throw it
 */
public void afterMD5Rename() throws IOException {
  // Intentionally empty: no fault is injected by default.
}
|
||||
}
|
||||
|
@ -1103,7 +1103,7 @@ void endCheckpoint(CheckpointSignature sig) throws IOException {
|
||||
*/
|
||||
public synchronized void saveDigestAndRenameCheckpointImage(
|
||||
long txid, MD5Hash digest) throws IOException {
|
||||
renameCheckpoint(txid);
|
||||
// Write and rename MD5 file
|
||||
List<StorageDirectory> badSds = Lists.newArrayList();
|
||||
|
||||
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
|
||||
@ -1116,6 +1116,10 @@ public synchronized void saveDigestAndRenameCheckpointImage(
|
||||
}
|
||||
storage.reportErrorsOnDirectories(badSds);
|
||||
|
||||
CheckpointFaultInjector.getInstance().afterMD5Rename();
|
||||
|
||||
// Rename image from tmp file
|
||||
renameCheckpoint(txid);
|
||||
// So long as this is the newest image available,
|
||||
// advertise it as such to other checkpointers
|
||||
// from now on
|
||||
|
@ -231,7 +231,7 @@ public void testWriteTransactionIdHandlesIOE() throws Exception {
|
||||
/*
|
||||
* Simulate exception during edit replay.
|
||||
*/
|
||||
@Test(timeout=5000)
|
||||
@Test(timeout=30000)
|
||||
public void testReloadOnEditReplayFailure () throws IOException {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
FSDataOutputStream fos = null;
|
||||
@ -1411,6 +1411,60 @@ public void testSecondaryImageDownload() throws IOException {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test NN restart if a failure happens in between creating the fsimage
|
||||
* MD5 file and renaming the fsimage.
|
||||
*/
|
||||
@Test(timeout=30000)
|
||||
public void testFailureBeforeRename () throws IOException {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
FSDataOutputStream fos = null;
|
||||
SecondaryNameNode secondary = null;
|
||||
MiniDFSCluster cluster = null;
|
||||
FileSystem fs = null;
|
||||
NameNode namenode = null;
|
||||
|
||||
try {
|
||||
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
|
||||
.build();
|
||||
cluster.waitActive();
|
||||
namenode = cluster.getNameNode();
|
||||
fs = cluster.getFileSystem();
|
||||
secondary = startSecondaryNameNode(conf);
|
||||
fos = fs.create(new Path("tmpfile0"));
|
||||
fos.write(new byte[] { 0, 1, 2, 3 });
|
||||
secondary.doCheckpoint();
|
||||
fos.write(new byte[] { 0, 1, 2, 3 });
|
||||
fos.hsync();
|
||||
|
||||
// Cause merge to fail in next checkpoint.
|
||||
Mockito.doThrow(new IOException(
|
||||
"Injecting failure after MD5Rename"))
|
||||
.when(faultInjector).afterMD5Rename();
|
||||
|
||||
try {
|
||||
secondary.doCheckpoint();
|
||||
fail("Fault injection failed.");
|
||||
} catch (IOException ioe) {
|
||||
// This is expected.
|
||||
}
|
||||
Mockito.reset(faultInjector);
|
||||
// Namenode should still restart successfully
|
||||
cluster.restartNameNode();
|
||||
} finally {
|
||||
if (secondary != null) {
|
||||
secondary.shutdown();
|
||||
}
|
||||
if (fs != null) {
|
||||
fs.close();
|
||||
}
|
||||
if (cluster != null) {
|
||||
cluster.shutdown();
|
||||
}
|
||||
Mockito.reset(faultInjector);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test case where two secondary namenodes are checkpointing the same
|
||||
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}
|
||||
|
Loading…
Reference in New Issue
Block a user