HDFS-4832. Namenode doesn't change the number of missing blocks in safemode when DNs rejoin or leave. Contributed by Ravi Prakash.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1490803 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a764da16d6
commit
2a76cddcd5
@ -3123,6 +3123,9 @@ Release 0.23.9 - UNRELEASED
|
|||||||
HDFS-4862. SafeModeInfo.isManual() returns true when resources are low even
|
HDFS-4862. SafeModeInfo.isManual() returns true when resources are low even
|
||||||
if it wasn't entered into manually (Ravi Prakash via kihwal)
|
if it wasn't entered into manually (Ravi Prakash via kihwal)
|
||||||
|
|
||||||
|
HDFS-4832. Namenode doesn't change the number of missing blocks in
|
||||||
|
safemode when DNs rejoin or leave (Ravi Prakash via kihwal)
|
||||||
|
|
||||||
Release 0.23.8 - 2013-06-05
|
Release 0.23.8 - 2013-06-05
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
@ -2156,7 +2156,7 @@ private Block addStoredBlock(final BlockInfo block,
|
|||||||
return storedBlock;
|
return storedBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
// do not try to handle over/under-replicated blocks during safe mode
|
// do not try to handle over/under-replicated blocks during first safe mode
|
||||||
if (!namesystem.isPopulatingReplQueues()) {
|
if (!namesystem.isPopulatingReplQueues()) {
|
||||||
return storedBlock;
|
return storedBlock;
|
||||||
}
|
}
|
||||||
|
@ -1169,6 +1169,12 @@ public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
|
|||||||
heartbeatManager.updateHeartbeat(nodeinfo, capacity, dfsUsed,
|
heartbeatManager.updateHeartbeat(nodeinfo, capacity, dfsUsed,
|
||||||
remaining, blockPoolUsed, xceiverCount, failedVolumes);
|
remaining, blockPoolUsed, xceiverCount, failedVolumes);
|
||||||
|
|
||||||
|
// If we are in safemode, do not send back any recovery / replication
|
||||||
|
// requests. Don't even drain the existing queue of work.
|
||||||
|
if(namesystem.isInSafeMode()) {
|
||||||
|
return new DatanodeCommand[0];
|
||||||
|
}
|
||||||
|
|
||||||
//check lease recovery
|
//check lease recovery
|
||||||
BlockInfoUnderConstruction[] blocks = nodeinfo
|
BlockInfoUnderConstruction[] blocks = nodeinfo
|
||||||
.getLeaseRecoveryCommand(Integer.MAX_VALUE);
|
.getLeaseRecoveryCommand(Integer.MAX_VALUE);
|
||||||
|
@ -223,7 +223,7 @@ void heartbeatCheck() {
|
|||||||
final DatanodeManager dm = blockManager.getDatanodeManager();
|
final DatanodeManager dm = blockManager.getDatanodeManager();
|
||||||
// It's OK to check safe mode w/o taking the lock here, we re-check
|
// It's OK to check safe mode w/o taking the lock here, we re-check
|
||||||
// for safe mode after taking the lock before removing a datanode.
|
// for safe mode after taking the lock before removing a datanode.
|
||||||
if (namesystem.isInSafeMode()) {
|
if (namesystem.isInStartupSafeMode()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
boolean allAlive = false;
|
boolean allAlive = false;
|
||||||
@ -252,7 +252,7 @@ void heartbeatCheck() {
|
|||||||
// acquire the fsnamesystem lock, and then remove the dead node.
|
// acquire the fsnamesystem lock, and then remove the dead node.
|
||||||
namesystem.writeLock();
|
namesystem.writeLock();
|
||||||
try {
|
try {
|
||||||
if (namesystem.isInSafeMode()) {
|
if (namesystem.isInStartupSafeMode()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
synchronized(this) {
|
synchronized(this) {
|
||||||
|
@ -4093,7 +4093,7 @@ private boolean shouldIncrementallyTrackBlocks() {
|
|||||||
*
|
*
|
||||||
* @see SafeModeInfo
|
* @see SafeModeInfo
|
||||||
*/
|
*/
|
||||||
private SafeModeInfo(boolean resourcesLow) {
|
private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) {
|
||||||
this.threshold = 1.5f; // this threshold can never be reached
|
this.threshold = 1.5f; // this threshold can never be reached
|
||||||
this.datanodeThreshold = Integer.MAX_VALUE;
|
this.datanodeThreshold = Integer.MAX_VALUE;
|
||||||
this.extension = Integer.MAX_VALUE;
|
this.extension = Integer.MAX_VALUE;
|
||||||
@ -4102,6 +4102,7 @@ private SafeModeInfo(boolean resourcesLow) {
|
|||||||
this.blockTotal = -1;
|
this.blockTotal = -1;
|
||||||
this.blockSafe = -1;
|
this.blockSafe = -1;
|
||||||
this.resourcesLow = resourcesLow;
|
this.resourcesLow = resourcesLow;
|
||||||
|
this.initializedReplQueues = isReplQueuesInited;
|
||||||
enter();
|
enter();
|
||||||
reportStatus("STATE* Safe mode is ON.", true);
|
reportStatus("STATE* Safe mode is ON.", true);
|
||||||
}
|
}
|
||||||
@ -4527,6 +4528,10 @@ public boolean isInStartupSafeMode() {
|
|||||||
&& safeMode.isOn();
|
&& safeMode.isOn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if replication queues are to be populated
|
||||||
|
* @return true when node is HAState.Active and not in the very first safemode
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean isPopulatingReplQueues() {
|
public boolean isPopulatingReplQueues() {
|
||||||
if (!shouldPopulateReplQueues()) {
|
if (!shouldPopulateReplQueues()) {
|
||||||
@ -4657,7 +4662,7 @@ void enterSafeMode(boolean resourcesLow) throws IOException {
|
|||||||
getEditLog().logSyncAll();
|
getEditLog().logSyncAll();
|
||||||
}
|
}
|
||||||
if (!isInSafeMode()) {
|
if (!isInSafeMode()) {
|
||||||
safeMode = new SafeModeInfo(resourcesLow);
|
safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (resourcesLow) {
|
if (resourcesLow) {
|
||||||
|
@ -34,9 +34,12 @@
|
|||||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
|
import org.mockito.internal.util.reflection.Whitebox;
|
||||||
|
|
||||||
public class TestFSNamesystem {
|
public class TestFSNamesystem {
|
||||||
|
|
||||||
@ -104,4 +107,39 @@ public void testStartupSafemode() throws IOException {
|
|||||||
assertTrue("After entering safemode due to low resources FSNamesystem."
|
assertTrue("After entering safemode due to low resources FSNamesystem."
|
||||||
+ "isInSafeMode still returned false", fsn.isInSafeMode());
|
+ "isInSafeMode still returned false", fsn.isInSafeMode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReplQueuesActiveAfterStartupSafemode() throws IOException, InterruptedException{
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
|
||||||
|
FSEditLog fsEditLog = Mockito.mock(FSEditLog.class);
|
||||||
|
FSImage fsImage = Mockito.mock(FSImage.class);
|
||||||
|
Mockito.when(fsImage.getEditLog()).thenReturn(fsEditLog);
|
||||||
|
|
||||||
|
FSNamesystem fsNamesystem = new FSNamesystem(conf, fsImage);
|
||||||
|
FSNamesystem fsn = Mockito.spy(fsNamesystem);
|
||||||
|
|
||||||
|
//Make shouldPopulateReplQueues return true
|
||||||
|
HAContext haContext = Mockito.mock(HAContext.class);
|
||||||
|
HAState haState = Mockito.mock(HAState.class);
|
||||||
|
Mockito.when(haContext.getState()).thenReturn(haState);
|
||||||
|
Mockito.when(haState.shouldPopulateReplQueues()).thenReturn(true);
|
||||||
|
Whitebox.setInternalState(fsn, "haContext", haContext);
|
||||||
|
|
||||||
|
//Make NameNode.getNameNodeMetrics() not return null
|
||||||
|
NameNode.initMetrics(conf, NamenodeRole.NAMENODE);
|
||||||
|
|
||||||
|
fsn.enterSafeMode(false);
|
||||||
|
assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode());
|
||||||
|
assertTrue("Replication queues were being populated during very first "
|
||||||
|
+ "safemode", !fsn.isPopulatingReplQueues());
|
||||||
|
fsn.leaveSafeMode();
|
||||||
|
assertTrue("FSNamesystem didn't leave safemode", !fsn.isInSafeMode());
|
||||||
|
assertTrue("Replication queues weren't being populated even after leaving "
|
||||||
|
+ "safemode", fsn.isPopulatingReplQueues());
|
||||||
|
fsn.enterSafeMode(false);
|
||||||
|
assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode());
|
||||||
|
assertTrue("Replication queues weren't being populated after entering "
|
||||||
|
+ "safemode 2nd time", fsn.isPopulatingReplQueues());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user