HDFS-3259. NameNode#initializeSharedEdits should populate shared edits dir with edit log segments. Contributed by Aaron T. Myers.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1325518 13f79535-47bb-0310-9956-ffa450edef68
Aaron Myers 2012-04-12 21:11:32 +00:00
parent 047a7b276c
commit 07a4367445
5 changed files with 128 additions and 30 deletions
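For context: before this patch, initializeSharedEdits only formatted the shared edits directory, so a standby NameNode could not start from it until edit log segments appeared there; with this patch it also copies every finalized segment since the last checkpoint into the shared dir (see copyEditLogSegmentsToSharedDir in NameNode.java below). A minimal sketch of driving the entry point; the Configuration setup is illustrative and assumes an hdfs-site.xml that already carries the HA settings, including dfs.namenode.shared.edits.dir, while the initializeSharedEdits call and its aborted/not-aborted return convention are taken from the patch:

import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.namenode.NameNode;

public class InitializeSharedEditsDriver {
  public static void main(String[] args) {
    // Loads core-site.xml and hdfs-site.xml from the classpath.
    HdfsConfiguration conf = new HdfsConfiguration();
    // Returns true if initialization aborted, false on success.
    boolean aborted = NameNode.initializeSharedEdits(conf);
    System.exit(aborted ? 1 : 0);
  }
}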

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -373,6 +373,9 @@ Release 2.0.0 - UNRELEASED
     HDFS-2983. Relax the build version check to permit rolling upgrades within
     a release. (atm)
 
+    HDFS-3259. NameNode#initializeSharedEdits should populate shared edits dir
+    with edit log segments. (atm)
+
   OPTIMIZATIONS
 
     HDFS-3024. Improve performance of stringification in addStoredBlock (todd)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

@@ -311,10 +311,12 @@ synchronized void close() {
       endCurrentLogSegment(true);
     }
-    try {
-      journalSet.close();
-    } catch (IOException ioe) {
-      LOG.warn("Error closing journalSet", ioe);
+    if (!journalSet.isEmpty()) {
+      try {
+        journalSet.close();
+      } catch (IOException ioe) {
+        LOG.warn("Error closing journalSet", ioe);
+      }
     }
     state = State.CLOSED;
@@ -813,9 +815,8 @@ void logReassignLease(String leaseHolder, String src, String newHolder) {
   }
 
   /**
-   * Used only by unit tests.
+   * Get all the journals this edit log is currently operating on.
    */
   @VisibleForTesting
   synchronized List<JournalAndStream> getJournals() {
     return journalSet.getAllJournalStreams();
   }

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java

@@ -344,7 +344,7 @@ synchronized public void recoverUnfinalizedSegments() throws IOException {
     }
   }
 
-  private List<EditLogFile> getLogFiles(long fromTxId) throws IOException {
+  List<EditLogFile> getLogFiles(long fromTxId) throws IOException {
     File currentDir = sd.getCurrentDir();
     List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
     List<EditLogFile> logFiles = Lists.newArrayList();
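The only change in this file is visibility: dropping private makes getLogFiles package-private, just enough for NameNode (in the same org.apache.hadoop.hdfs.server.namenode package) to enumerate finalized segments. The new copy loop in NameNode.java calls it with getMostRecentCheckpointTxId(), i.e. it asks for every segment containing transactions at or after the last checkpoint, which is exactly what a standby needs on top of the most recent fsimage.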

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -18,14 +18,17 @@
 package org.apache.hadoop.hdfs.server.namenode;
 
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.net.InetSocketAddress;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.HadoopIllegalArgumentException;
@@ -41,7 +44,6 @@
import org.apache.hadoop.fs.Trash;
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
@@ -49,6 +51,9 @@
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
+import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
 import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState;
 import org.apache.hadoop.hdfs.server.namenode.ha.BootstrapStandby;
 import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
@@ -61,6 +66,8 @@
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.net.NetUtils;
@@ -749,9 +756,10 @@ public static boolean initializeSharedEdits(Configuration conf,
       boolean force) {
     return initializeSharedEdits(conf, force, false);
   }
 
   /**
-   * Format a new shared edits dir.
+   * Format a new shared edits dir and copy in enough edit log segments so that
+   * the standby NN can start up.
    *
    * @param conf configuration
    * @param force format regardless of whether or not the shared edits dir exists
@@ -785,8 +793,19 @@ private static boolean initializeSharedEdits(Configuration conf,
           existingStorage.getBlockPoolID(),
           existingStorage.getCTime(),
           existingStorage.getDistributedUpgradeVersion()));
-    } catch (Exception e) {
-      LOG.error("Could not format shared edits dir", e);
+
+      // Need to make sure the edit log segments are in good shape to initialize
+      // the shared edits dir.
+      fsns.getFSImage().getEditLog().close();
+      fsns.getFSImage().getEditLog().initJournalsForWrite();
+      fsns.getFSImage().getEditLog().recoverUnclosedStreams();
+
+      if (copyEditLogSegmentsToSharedDir(fsns, sharedEditsDirs,
+          newSharedStorage, conf)) {
+        return true; // aborted
+      }
+    } catch (IOException ioe) {
+      LOG.error("Could not initialize shared edits dir", ioe);
       return true; // aborted
     } finally {
       // Have to unlock storage explicitly for the case when we're running in a
@@ -802,6 +821,44 @@ private static boolean initializeSharedEdits(Configuration conf,
     }
     return false; // did not abort
   }
+
+  private static boolean copyEditLogSegmentsToSharedDir(FSNamesystem fsns,
+      Collection<URI> sharedEditsDirs, NNStorage newSharedStorage,
+      Configuration conf) throws FileNotFoundException, IOException {
+    // Copy edit log segments into the new shared edits dir.
+    for (JournalAndStream jas : fsns.getFSImage().getEditLog().getJournals()) {
+      FileJournalManager fjm = null;
+      if (!(jas.getManager() instanceof FileJournalManager)) {
+        LOG.error("Cannot populate shared edits dir from non-file " +
+            "journal manager: " + jas.getManager());
+        return true; // aborted
+      } else {
+        fjm = (FileJournalManager) jas.getManager();
+      }
+      for (EditLogFile elf : fjm.getLogFiles(fsns.getFSImage()
+          .getMostRecentCheckpointTxId())) {
+        File editLogSegment = elf.getFile();
+        for (URI sharedEditsUri : sharedEditsDirs) {
+          StorageDirectory sharedEditsDir = newSharedStorage
+              .getStorageDirectory(sharedEditsUri);
+          File targetFile = new File(sharedEditsDir.getCurrentDir(),
+              editLogSegment.getName());
+          if (!targetFile.exists()) {
+            InputStream in = null;
+            OutputStream out = null;
+            try {
+              in = new FileInputStream(editLogSegment);
+              out = new AtomicFileOutputStream(targetFile);
+              IOUtils.copyBytes(in, out, conf);
+            } finally {
+              IOUtils.cleanup(LOG, in, out);
+            }
+          }
+        }
+      }
+    }
+    return false; // did not abort
+  }
 
   private static boolean finalize(Configuration conf,
       boolean isConfirmationNeeded

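A note on the copy idiom above: IOUtils.copyBytes(in, out, conf) sizes its buffer from io.file.buffer.size in the configuration, and AtomicFileOutputStream stages writes in a temporary file that is only renamed to targetFile on a clean close, so an interrupted copy never leaves a truncated file under a real segment name. A rough standalone equivalent of that write-then-rename pattern (hypothetical helper, not from the patch):

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;

final class AtomicCopy {
  static void copyAtomically(File src, File dst) throws IOException {
    File tmp = new File(dst.getParentFile(), dst.getName() + ".tmp");
    try (InputStream in = new FileInputStream(src);
         OutputStream out = new FileOutputStream(tmp)) {
      byte[] buf = new byte[4096];
      for (int n; (n = in.read(buf)) != -1; ) {
        out.write(buf, 0, n);
      }
    }
    // Publish the fully written copy; a crash above leaves only dst.tmp.
    Files.move(tmp.toPath(), dst.toPath(), StandardCopyOption.ATOMIC_MOVE);
  }
}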
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestInitializeSharedEdits.java

@@ -19,17 +19,22 @@
 import java.io.File;
 import java.io.IOException;
+import java.net.URISyntaxException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.MiniDFSNNTopology;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Before;
@@ -48,7 +53,10 @@ public class TestInitializeSharedEdits {
   @Before
   public void setupCluster() throws IOException {
     conf = new Configuration();
+    conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+    HAUtil.setAllowStandbyReads(conf, true);
     MiniDFSNNTopology topology = MiniDFSNNTopology.simpleHATopology();
     cluster = new MiniDFSCluster.Builder(conf)
@@ -56,11 +64,8 @@ public void setupCluster() throws IOException {
         .numDataNodes(0)
         .build();
     cluster.waitActive();
-    cluster.shutdownNameNode(0);
-    cluster.shutdownNameNode(1);
-    File sharedEditsDir = new File(cluster.getSharedEditsDir(0, 1));
-    assertTrue(FileUtil.fullyDelete(sharedEditsDir));
+
+    shutdownClusterAndRemoveSharedEditsDir();
   }
 
   @After
@@ -70,8 +75,14 @@ public void shutdownCluster() throws IOException {
     }
   }
 
-  @Test
-  public void testInitializeSharedEdits() throws Exception {
+  private void shutdownClusterAndRemoveSharedEditsDir() throws IOException {
+    cluster.shutdownNameNode(0);
+    cluster.shutdownNameNode(1);
+    File sharedEditsDir = new File(cluster.getSharedEditsDir(0, 1));
+    assertTrue(FileUtil.fullyDelete(sharedEditsDir));
+  }
+
+  private void assertCannotStartNameNodes() {
     // Make sure we can't currently start either NN.
     try {
       cluster.restartNameNode(0, false);
@@ -89,24 +100,27 @@ public void testInitializeSharedEdits() throws Exception {
       GenericTestUtils.assertExceptionContains(
           "Cannot start an HA namenode with name dirs that need recovery", ioe);
     }
-
-    // Initialize the shared edits dir.
-    assertFalse(NameNode.initializeSharedEdits(conf));
+  }
+
+  private void assertCanStartHaNameNodes(String pathSuffix)
+      throws ServiceFailedException, IOException, URISyntaxException,
+      InterruptedException {
     // Now should be able to start both NNs. Pass "false" here so that we don't
     // try to waitActive on all NNs, since the second NN doesn't exist yet.
     cluster.restartNameNode(0, false);
     cluster.restartNameNode(1, true);
 
     // Make sure HA is working.
-    cluster.transitionToActive(0);
+    cluster.getNameNode(0).getRpcServer().transitionToActive();
     FileSystem fs = null;
     try {
+      Path newPath = new Path(TEST_PATH, pathSuffix);
       fs = HATestUtil.configureFailoverFs(cluster, conf);
-      assertTrue(fs.mkdirs(TEST_PATH));
-      cluster.transitionToStandby(0);
-      cluster.transitionToActive(1);
-      assertTrue(fs.isDirectory(TEST_PATH));
+      assertTrue(fs.mkdirs(newPath));
+      HATestUtil.waitForStandbyToCatchUp(cluster.getNameNode(0),
+          cluster.getNameNode(1));
+      assertTrue(NameNodeAdapter.getFileInfo(cluster.getNameNode(1),
+          newPath.toString(), false).isDir());
     } finally {
       if (fs != null) {
         fs.close();
@@ -114,6 +128,29 @@ public void testInitializeSharedEdits() throws Exception {
       }
     }
   }
 
+  @Test
+  public void testInitializeSharedEdits() throws Exception {
+    assertCannotStartNameNodes();
+
+    // Initialize the shared edits dir.
+    assertFalse(NameNode.initializeSharedEdits(cluster.getConfiguration(0)));
+    assertCanStartHaNameNodes("1");
+
+    // Now that we've done a metadata operation, make sure that deleting and
+    // re-initializing the shared edits dir will let the standby still start.
+    shutdownClusterAndRemoveSharedEditsDir();
+    assertCannotStartNameNodes();
+
+    // Re-initialize the shared edits dir.
+    assertFalse(NameNode.initializeSharedEdits(cluster.getConfiguration(0)));
+    // Should *still* be able to start both NNs
+    assertCanStartHaNameNodes("2");
+  }
+
+  @Test
+  public void testDontOverWriteExistingDir() {
+    assertFalse(NameNode.initializeSharedEdits(conf, false));