HDFS-2011. Removal and restoration of storage directories on checkpointing failure doesn't work properly. Contributed by Ravi Prakash.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1141748 13f79535-47bb-0310-9956-ffa450edef68
Matthew Foley 2011-06-30 23:21:58 +00:00
parent 3af51887b4
commit 7bd41f031f
4 changed files with 98 additions and 10 deletions


@@ -556,6 +556,9 @@ Trunk (unreleased changes)
BUG FIXES
HDFS-2011. Removal and restoration of storage directories on checkpointing
failure doesn't work properly. (Ravi Prakash via mattf)
HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826.
(mattf)


@@ -122,19 +122,31 @@ void create() throws IOException {
public void close() throws IOException {
// close should have been called after all pending transactions
// have been flushed & synced.
// if already closed, just skip
if(bufCurrent != null)
{
int bufSize = bufCurrent.size();
if (bufSize != 0) {
throw new IOException("FSEditStream has " + bufSize
+ " bytes still to be flushed and cannot " + "be closed.");
}
bufCurrent.close();
bufCurrent = null;
}
if(bufReady != null) {
bufReady.close();
bufReady = null;
}
// remove the last INVALID marker from transaction log.
if (fc != null && fc.isOpen()) {
fc.truncate(fc.position());
fc.close();
}
if (fp != null) {
fp.close();
}
bufCurrent = bufReady = null;
}
/**
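
The rewritten close() above guards each resource with a null check and nulls it out once released, so a second call to close() becomes a no-op instead of a NullPointerException on an already-closed buffer. A minimal standalone sketch of that idempotent-close pattern (illustrative class and field names, not the actual EditLogFileOutputStream):

import java.io.Closeable;
import java.io.IOException;

// Simplified illustration only: 'buffer' stands in for bufCurrent/bufReady
// and 'channel' for fc/fp in the real stream.
class IdempotentCloseSketch implements Closeable {
  private Closeable buffer;
  private Closeable channel;

  IdempotentCloseSketch(Closeable buffer, Closeable channel) {
    this.buffer = buffer;
    this.channel = channel;
  }

  @Override
  public void close() throws IOException {
    // After the first call every field is null, so a second call does nothing.
    if (buffer != null) {
      buffer.close();
      buffer = null;
    }
    if (channel != null) {
      channel.close();
      channel = null;
    }
  }
}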


@@ -511,6 +511,12 @@ public void setCheckpointTimeInStorage(long newCpT) {
// Close any edits stream associated with this dir and remove directory
LOG.warn("incrementCheckpointTime failed on "
+ sd.getRoot().getPath() + ";type="+sd.getStorageDirType());
try {
reportErrorsOnDirectory(sd);
} catch (IOException ioe) {
LOG.error("Failed to report and remove NN storage directory "
+ sd.getRoot().getPath(), ioe);
}
}
}
}
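
The try/catch added above means that a failure while reporting one broken storage directory is logged rather than propagated, so the remaining directories still receive the new checkpoint time and the broken one ends up on the removed list (which the new test below checks through getRemovedStorageDirs()). A rough standalone sketch of that per-directory error isolation, using hypothetical Dir/write names rather than the real NNStorage internals:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class RemoveOnFailureSketch {
  // Hypothetical stand-in for a NameNode storage directory.
  static class Dir {
    final String path;
    final boolean broken;
    Dir(String path, boolean broken) { this.path = path; this.broken = broken; }
  }

  private final List<Dir> active = new ArrayList<Dir>();
  private final List<Dir> removed = new ArrayList<Dir>();

  void addDirectory(Dir dir) { active.add(dir); }
  List<Dir> getRemoved() { return removed; }

  void setCheckpointTime(long newCpT) {
    // Iterate over a copy so failed directories can be dropped mid-loop.
    for (Dir dir : new ArrayList<Dir>(active)) {
      try {
        write(dir, newCpT);
      } catch (IOException e) {
        // Report and retire the failed directory instead of aborting;
        // the remaining directories still get updated.
        active.remove(dir);
        removed.add(dir);
      }
    }
  }

  private void write(Dir dir, long newCpT) throws IOException {
    if (dir.broken) {
      throw new IOException("cannot write checkpoint time to " + dir.path);
    }
    // the real code would persist the checkpoint time into the directory here
  }
}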


@@ -21,6 +21,7 @@
import java.io.*;
import java.net.InetSocketAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Iterator;
@@ -137,6 +138,72 @@ private void testNamedirError(Configuration conf, Collection<URI> namedirs)
resurrectNameDir(first); // put back namedir
}
/**
* Tests EditLogFileOutputStream doesn't throw NullPointerException on being
* closed twice.
* See https://issues.apache.org/jira/browse/HDFS-2011
*/
public void testEditLogFileOutputStreamCloses()
throws IOException,NullPointerException {
System.out.println("Testing EditLogFileOutputStream doesn't throw " +
"NullPointerException on being closed twice");
File editLogStreamFile = null;
try {
editLogStreamFile = new File(System.getProperty("test.build.data","/tmp"),
"editLogStream.dat");
EditLogFileOutputStream editLogStream =
new EditLogFileOutputStream(editLogStreamFile, 0);
editLogStream.close();
// Closing it twice should not throw a NullPointerException
editLogStream.close();
} finally {
if (editLogStreamFile != null)
// Cleanup the editLogStream.dat file we created
editLogStreamFile.delete();
}
System.out.println("Successfully tested EditLogFileOutputStream doesn't " +
"throw NullPointerException on being closed twice");
}
/**
* Checks that an IOException in NNStorage.setCheckpointTimeInStorage is handled
* correctly (by removing the storage directory)
* See https://issues.apache.org/jira/browse/HDFS-2011
*/
public void testSetCheckpointTimeInStorageHandlesIOException() throws Exception {
System.out.println("Check IOException handled correctly by setCheckpointTimeInStorage");
NNStorage nnStorage = new NNStorage(new HdfsConfiguration());
ArrayList<URI> fsImageDirs = new ArrayList<URI>();
ArrayList<URI> editsDirs = new ArrayList<URI>();
File filePath =
new File(System.getProperty("test.build.data","/tmp"), "storageDirToCheck");
assertTrue("Couldn't create directory storageDirToCheck",
filePath.exists() || filePath.mkdirs());
try {
fsImageDirs.add(filePath.toURI());
editsDirs.add(filePath.toURI());
// Initialize NNStorage
nnStorage.setStorageDirectories(fsImageDirs, editsDirs);
assertTrue("List of storage directories didn't have storageDirToCheck.",
nnStorage.getEditsDirectories().iterator().next().
toString().indexOf("storageDirToCheck") != -1);
assertTrue("List of removed storage directories wasn't empty",
nnStorage.getRemovedStorageDirs().isEmpty());
} finally {
// Delete storage directory to cause IOException in setCheckpointTimeInStorage
assertTrue("Couldn't remove directory " + filePath.getAbsolutePath(),
filePath.delete());
}
// Just call setCheckpointTimeInStorage using any random number
nnStorage.setCheckpointTimeInStorage(1);
List<StorageDirectory> listRsd = nnStorage.getRemovedStorageDirs();
assertTrue("Removed directory wasn't what was expected",
listRsd.size() > 0 && listRsd.get(listRsd.size() - 1).getRoot().
toString().indexOf("storageDirToCheck") != -1);
System.out.println("Successfully checked IOException is handled correctly "
+ "by setCheckpointTimeInStorage");
}
/*
* Simulate namenode crashing after rolling edit log.
*/