HDFS-4832. Namenode doesn't change the number of missing blocks in safemode when DNs rejoin or leave. Contributed by Ravi Prakash.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1490803 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kihwal Lee 2013-06-07 20:01:55 +00:00
parent a764da16d6
commit 2a76cddcd5
6 changed files with 58 additions and 6 deletions

View File

@ -3123,6 +3123,9 @@ Release 0.23.9 - UNRELEASED
HDFS-4862. SafeModeInfo.isManual() returns true when resources are low even HDFS-4862. SafeModeInfo.isManual() returns true when resources are low even
if it wasn't entered into manually (Ravi Prakash via kihwal) if it wasn't entered into manually (Ravi Prakash via kihwal)
HDFS-4832. Namenode doesn't change the number of missing blocks in
safemode when DNs rejoin or leave (Ravi Prakash via kihwal)
Release 0.23.8 - 2013-06-05 Release 0.23.8 - 2013-06-05
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -2156,7 +2156,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
return storedBlock; return storedBlock;
} }
// do not try to handle over/under-replicated blocks during safe mode // do not try to handle over/under-replicated blocks during first safe mode
if (!namesystem.isPopulatingReplQueues()) { if (!namesystem.isPopulatingReplQueues()) {
return storedBlock; return storedBlock;
} }

View File

@ -1169,6 +1169,12 @@ public class DatanodeManager {
heartbeatManager.updateHeartbeat(nodeinfo, capacity, dfsUsed, heartbeatManager.updateHeartbeat(nodeinfo, capacity, dfsUsed,
remaining, blockPoolUsed, xceiverCount, failedVolumes); remaining, blockPoolUsed, xceiverCount, failedVolumes);
// If we are in safemode, do not send back any recovery / replication
// requests. Don't even drain the existing queue of work.
if(namesystem.isInSafeMode()) {
return new DatanodeCommand[0];
}
//check lease recovery //check lease recovery
BlockInfoUnderConstruction[] blocks = nodeinfo BlockInfoUnderConstruction[] blocks = nodeinfo
.getLeaseRecoveryCommand(Integer.MAX_VALUE); .getLeaseRecoveryCommand(Integer.MAX_VALUE);

View File

@ -223,7 +223,7 @@ class HeartbeatManager implements DatanodeStatistics {
final DatanodeManager dm = blockManager.getDatanodeManager(); final DatanodeManager dm = blockManager.getDatanodeManager();
// It's OK to check safe mode w/o taking the lock here, we re-check // It's OK to check safe mode w/o taking the lock here, we re-check
// for safe mode after taking the lock before removing a datanode. // for safe mode after taking the lock before removing a datanode.
if (namesystem.isInSafeMode()) { if (namesystem.isInStartupSafeMode()) {
return; return;
} }
boolean allAlive = false; boolean allAlive = false;
@ -252,7 +252,7 @@ class HeartbeatManager implements DatanodeStatistics {
// acquire the fsnamesystem lock, and then remove the dead node. // acquire the fsnamesystem lock, and then remove the dead node.
namesystem.writeLock(); namesystem.writeLock();
try { try {
if (namesystem.isInSafeMode()) { if (namesystem.isInStartupSafeMode()) {
return; return;
} }
synchronized(this) { synchronized(this) {

View File

@ -4093,7 +4093,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
* *
* @see SafeModeInfo * @see SafeModeInfo
*/ */
private SafeModeInfo(boolean resourcesLow) { private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) {
this.threshold = 1.5f; // this threshold can never be reached this.threshold = 1.5f; // this threshold can never be reached
this.datanodeThreshold = Integer.MAX_VALUE; this.datanodeThreshold = Integer.MAX_VALUE;
this.extension = Integer.MAX_VALUE; this.extension = Integer.MAX_VALUE;
@ -4102,6 +4102,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
this.blockTotal = -1; this.blockTotal = -1;
this.blockSafe = -1; this.blockSafe = -1;
this.resourcesLow = resourcesLow; this.resourcesLow = resourcesLow;
this.initializedReplQueues = isReplQueuesInited;
enter(); enter();
reportStatus("STATE* Safe mode is ON.", true); reportStatus("STATE* Safe mode is ON.", true);
} }
@ -4527,6 +4528,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
&& safeMode.isOn(); && safeMode.isOn();
} }
/**
* Check if replication queues are to be populated
* @return true when node is HAState.Active and not in the very first safemode
*/
@Override @Override
public boolean isPopulatingReplQueues() { public boolean isPopulatingReplQueues() {
if (!shouldPopulateReplQueues()) { if (!shouldPopulateReplQueues()) {
@ -4657,7 +4662,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
getEditLog().logSyncAll(); getEditLog().logSyncAll();
} }
if (!isInSafeMode()) { if (!isInSafeMode()) {
safeMode = new SafeModeInfo(resourcesLow); safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues());
return; return;
} }
if (resourcesLow) { if (resourcesLow) {

View File

@ -34,9 +34,12 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
import org.junit.After; import org.junit.After;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.internal.util.reflection.Whitebox;
public class TestFSNamesystem { public class TestFSNamesystem {
@ -104,4 +107,39 @@ public class TestFSNamesystem {
assertTrue("After entering safemode due to low resources FSNamesystem." assertTrue("After entering safemode due to low resources FSNamesystem."
+ "isInSafeMode still returned false", fsn.isInSafeMode()); + "isInSafeMode still returned false", fsn.isInSafeMode());
} }
/**
 * Checks how {@code isPopulatingReplQueues()} behaves across safemode
 * transitions: it must report false while the namesystem is in its very
 * first safemode, true after that safemode has been left, and still true
 * when safemode is entered a second time.
 */
@Test
public void testReplQueuesActiveAfterStartupSafemode() throws IOException, InterruptedException {
  final Configuration config = new Configuration();

  // Build the namesystem on top of a mocked image that hands out a mocked
  // edit log, then wrap it in a spy so its internal state can be replaced.
  final FSImage mockImage = Mockito.mock(FSImage.class);
  final FSEditLog mockEditLog = Mockito.mock(FSEditLog.class);
  Mockito.when(mockImage.getEditLog()).thenReturn(mockEditLog);
  final FSNamesystem fsn = Mockito.spy(new FSNamesystem(config, mockImage));

  // Make shouldPopulateReplQueues return true by injecting an HA context
  // whose state always answers true.
  final HAState mockState = Mockito.mock(HAState.class);
  Mockito.when(mockState.shouldPopulateReplQueues()).thenReturn(true);
  final HAContext mockContext = Mockito.mock(HAContext.class);
  Mockito.when(mockContext.getState()).thenReturn(mockState);
  Whitebox.setInternalState(fsn, "haContext", mockContext);

  // Make NameNode.getNameNodeMetrics() not return null
  NameNode.initMetrics(config, NamenodeRole.NAMENODE);

  // Phase 1: very first safemode -- queues must not be populated yet.
  fsn.enterSafeMode(false);
  assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode());
  final boolean populatingInFirstSafemode = fsn.isPopulatingReplQueues();
  assertTrue("Replication queues were being populated during very first "
      + "safemode", !populatingInFirstSafemode);

  // Phase 2: out of safemode -- queues must now be populated.
  fsn.leaveSafeMode();
  final boolean stillInSafemode = fsn.isInSafeMode();
  assertTrue("FSNamesystem didn't leave safemode", !stillInSafemode);
  assertTrue("Replication queues weren't being populated even after leaving "
      + "safemode", fsn.isPopulatingReplQueues());

  // Phase 3: second safemode -- queues must remain populated.
  fsn.enterSafeMode(false);
  assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode());
  assertTrue("Replication queues weren't being populated after entering "
      + "safemode 2nd time", fsn.isPopulatingReplQueues());
}
} }