HDFS-4888. Refactor and fix FSNamesystem.getTurnOffTip. Contributed by Ravi Prakash.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1498665 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aa7e148d1f
commit
ead7fa0413
@ -637,6 +637,9 @@ Release 2.1.0-beta - 2013-07-02
|
||||
HDFS-4944. WebHDFS cannot create a file path containing characters that must
|
||||
be URI-encoded, such as space. (cnauroth)
|
||||
|
||||
HDFS-4888. Refactor and fix FSNamesystem.getTurnOffTip. (Ravi Prakash via
|
||||
kihwal)
|
||||
|
||||
BREAKDOWN OF HDFS-347 SUBTASKS AND RELATED JIRAS
|
||||
|
||||
HDFS-4353. Encapsulate connections to peers in Peer and PeerServer classes.
|
||||
|
@ -4031,9 +4031,9 @@ class SafeModeInfo {
|
||||
|
||||
// internal fields
|
||||
/** Time when threshold was reached.
|
||||
*
|
||||
* <br>-1 safe mode is off
|
||||
* <br> 0 safe mode is on, but threshold is not reached yet
|
||||
* <br> -1 safe mode is off
|
||||
* <br> 0 safe mode is on, and threshold is not reached yet
|
||||
* <br> >0 safe mode is on, but we are in extension period
|
||||
*/
|
||||
private long reached = -1;
|
||||
/** Total number of blocks. */
|
||||
@ -4157,7 +4157,8 @@ private synchronized void leave() {
|
||||
NameNode.stateChangeLog.info("STATE* Leaving safe mode after "
|
||||
+ timeInSafemode/1000 + " secs");
|
||||
NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
|
||||
|
||||
|
||||
//Log the following only once (when transitioning from ON -> OFF)
|
||||
if (reached >= 0) {
|
||||
NameNode.stateChangeLog.info("STATE* Safe mode is OFF");
|
||||
}
|
||||
@ -4338,62 +4339,56 @@ private void setResourcesLow() {
|
||||
* A tip on how safe mode is to be turned off: manually or automatically.
|
||||
*/
|
||||
String getTurnOffTip() {
|
||||
if(reached < 0)
|
||||
if(!isOn())
|
||||
return "Safe mode is OFF.";
|
||||
String leaveMsg = "";
|
||||
|
||||
//Manual OR low-resource safemode. (Admin intervention required)
|
||||
String leaveMsg = "It was turned on manually. ";
|
||||
if (areResourcesLow()) {
|
||||
leaveMsg = "Resources are low on NN. "
|
||||
+ "Please add or free up more resources then turn off safe mode manually. "
|
||||
+ "NOTE: If you turn off safe mode before adding resources, "
|
||||
+ "the NN will immediately return to safe mode.";
|
||||
} else {
|
||||
leaveMsg = "Safe mode will be turned off automatically";
|
||||
leaveMsg = "Resources are low on NN. Please add or free up more "
|
||||
+ "resources then turn off safe mode manually. NOTE: If you turn off"
|
||||
+ " safe mode before adding resources, "
|
||||
+ "the NN will immediately return to safe mode. ";
|
||||
}
|
||||
if(isManual() && !areResourcesLow()) {
|
||||
leaveMsg = "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off";
|
||||
if (isManual() || areResourcesLow()) {
|
||||
return leaveMsg
|
||||
+ "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
|
||||
}
|
||||
|
||||
if(blockTotal < 0)
|
||||
return leaveMsg + ".";
|
||||
|
||||
//Automatic safemode. System will come out of safemode automatically.
|
||||
leaveMsg = "Safe mode will be turned off automatically";
|
||||
int numLive = getNumLiveDataNodes();
|
||||
String msg = "";
|
||||
if (reached == 0) {
|
||||
if (blockSafe < blockThreshold) {
|
||||
msg += String.format(
|
||||
"The reported blocks %d needs additional %d"
|
||||
+ " blocks to reach the threshold %.4f of total blocks %d.",
|
||||
+ " blocks to reach the threshold %.4f of total blocks %d.\n",
|
||||
blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
|
||||
}
|
||||
if (numLive < datanodeThreshold) {
|
||||
if (!"".equals(msg)) {
|
||||
msg += "\n";
|
||||
}
|
||||
msg += String.format(
|
||||
"The number of live datanodes %d needs an additional %d live "
|
||||
+ "datanodes to reach the minimum number %d.",
|
||||
+ "datanodes to reach the minimum number %d.\n",
|
||||
numLive, (datanodeThreshold - numLive), datanodeThreshold);
|
||||
}
|
||||
msg += " " + leaveMsg;
|
||||
} else {
|
||||
msg = String.format("The reported blocks %d has reached the threshold"
|
||||
+ " %.4f of total blocks %d.", blockSafe, threshold,
|
||||
blockTotal);
|
||||
+ " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
|
||||
|
||||
if (datanodeThreshold > 0) {
|
||||
msg += String.format(" The number of live datanodes %d has reached "
|
||||
+ "the minimum number %d.",
|
||||
msg += String.format("The number of live datanodes %d has reached "
|
||||
+ "the minimum number %d. ",
|
||||
numLive, datanodeThreshold);
|
||||
}
|
||||
msg += " " + leaveMsg;
|
||||
}
|
||||
msg += leaveMsg;
|
||||
// threshold is not reached or manual or resources low
|
||||
if(reached == 0 || (isManual() && !areResourcesLow())) {
|
||||
return msg + ".";
|
||||
return msg;
|
||||
}
|
||||
// extension period is in progress
|
||||
return msg + " in " + Math.abs(reached + extension - now()) / 1000
|
||||
+ " seconds.";
|
||||
return msg + (reached + extension - now() > 0 ?
|
||||
" in " + (reached + extension - now()) / 1000 + " seconds."
|
||||
: " soon.");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5648,7 +5643,7 @@ public long getTotal() {
|
||||
public String getSafemode() {
|
||||
if (!this.isInSafeMode())
|
||||
return "";
|
||||
return "Safe mode is ON." + this.getSafeModeTip();
|
||||
return "Safe mode is ON. " + this.getSafeModeTip();
|
||||
}
|
||||
|
||||
@Override // NameNodeMXBean
|
||||
|
@ -178,9 +178,9 @@ public void testInitializeReplQueuesEarly() throws Exception {
|
||||
final NameNode nn = cluster.getNameNode();
|
||||
|
||||
String status = nn.getNamesystem().getSafemode();
|
||||
assertEquals("Safe mode is ON.The reported blocks 0 needs additional " +
|
||||
"15 blocks to reach the threshold 0.9990 of total blocks 15. " +
|
||||
"Safe mode will be turned off automatically.", status);
|
||||
assertEquals("Safe mode is ON. The reported blocks 0 needs additional " +
|
||||
"15 blocks to reach the threshold 0.9990 of total blocks 15.\n" +
|
||||
"Safe mode will be turned off automatically", status);
|
||||
assertFalse("Mis-replicated block queues should not be initialized " +
|
||||
"until threshold is crossed",
|
||||
NameNodeAdapter.safeModeInitializedReplQueues(nn));
|
||||
@ -353,10 +353,10 @@ public void testDatanodeThreshold() throws IOException {
|
||||
fs = cluster.getFileSystem();
|
||||
|
||||
String tipMsg = cluster.getNamesystem().getSafemode();
|
||||
assertTrue("Safemode tip message looks right: " + tipMsg,
|
||||
assertTrue("Safemode tip message doesn't look right: " + tipMsg,
|
||||
tipMsg.contains("The number of live datanodes 0 needs an additional " +
|
||||
"1 live datanodes to reach the minimum number 1. " +
|
||||
"Safe mode will be turned off automatically."));
|
||||
"1 live datanodes to reach the minimum number 1.\n" +
|
||||
"Safe mode will be turned off automatically"));
|
||||
|
||||
// Start a datanode
|
||||
cluster.startDataNodes(conf, 1, true, null, null);
|
||||
|
@ -206,11 +206,11 @@ public void testBlocksAddedBeforeStandbyRestart() throws Exception {
|
||||
// We expect it not to be stuck in safemode, since those blocks
|
||||
// that are already visible to the SBN should be processed
|
||||
// in the initial block reports.
|
||||
assertSafeMode(nn1, 3, 3);
|
||||
assertSafeMode(nn1, 3, 3, 3, 0);
|
||||
|
||||
banner("Waiting for standby to catch up to active namespace");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
assertSafeMode(nn1, 8, 8);
|
||||
assertSafeMode(nn1, 8, 8, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -230,7 +230,7 @@ public void testBlocksAddedWhileInSafeMode() throws Exception {
|
||||
banner("Restarting standby");
|
||||
restartStandby();
|
||||
|
||||
assertSafeMode(nn1, 3, 3);
|
||||
assertSafeMode(nn1, 3, 3, 3, 0);
|
||||
|
||||
// Create a few blocks which will send blockReceived calls to the
|
||||
// SBN.
|
||||
@ -241,7 +241,7 @@ public void testBlocksAddedWhileInSafeMode() throws Exception {
|
||||
banner("Waiting for standby to catch up to active namespace");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
|
||||
assertSafeMode(nn1, 8, 8);
|
||||
assertSafeMode(nn1, 8, 8, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -281,11 +281,11 @@ public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
|
||||
|
||||
banner("Restarting standby");
|
||||
restartStandby();
|
||||
assertSafeMode(nn1, 0, 5);
|
||||
assertSafeMode(nn1, 0, 5, 3, 0);
|
||||
|
||||
banner("Waiting for standby to catch up to active namespace");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
assertSafeMode(nn1, 0, 0);
|
||||
assertSafeMode(nn1, 0, 0, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -307,7 +307,7 @@ public void testBlocksRemovedWhileInSafeMode() throws Exception {
|
||||
restartStandby();
|
||||
|
||||
// It will initially have all of the blocks necessary.
|
||||
assertSafeMode(nn1, 10, 10);
|
||||
assertSafeMode(nn1, 10, 10, 3, 0);
|
||||
|
||||
// Delete those blocks while the SBN is in safe mode.
|
||||
// This doesn't affect the SBN, since deletions are not
|
||||
@ -322,14 +322,14 @@ public void testBlocksRemovedWhileInSafeMode() throws Exception {
|
||||
HATestUtil.waitForDNDeletions(cluster);
|
||||
cluster.triggerDeletionReports();
|
||||
|
||||
assertSafeMode(nn1, 10, 10);
|
||||
assertSafeMode(nn1, 10, 10, 3, 0);
|
||||
|
||||
// When we catch up to active namespace, it will restore back
|
||||
// to 0 blocks.
|
||||
banner("Waiting for standby to catch up to active namespace");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
|
||||
assertSafeMode(nn1, 0, 0);
|
||||
assertSafeMode(nn1, 0, 0, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -355,20 +355,20 @@ public void testAppendWhileInSafeMode() throws Exception {
|
||||
restartStandby();
|
||||
|
||||
// It will initially have all of the blocks necessary.
|
||||
assertSafeMode(nn1, 5, 5);
|
||||
assertSafeMode(nn1, 5, 5, 3, 0);
|
||||
|
||||
// Append to a block while SBN is in safe mode. This should
|
||||
// not affect safemode initially, since the DN message
|
||||
// will get queued.
|
||||
FSDataOutputStream stm = fs.append(new Path("/test"));
|
||||
try {
|
||||
assertSafeMode(nn1, 5, 5);
|
||||
assertSafeMode(nn1, 5, 5, 3, 0);
|
||||
|
||||
// if we roll edits now, the SBN should see that it's under construction
|
||||
// and change its total count and safe count down by one, since UC
|
||||
// blocks are not counted by safe mode.
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
assertSafeMode(nn1, 4, 4);
|
||||
assertSafeMode(nn1, 4, 4, 3, 0);
|
||||
} finally {
|
||||
IOUtils.closeStream(stm);
|
||||
}
|
||||
@ -386,13 +386,13 @@ public void testAppendWhileInSafeMode() throws Exception {
|
||||
HATestUtil.waitForDNDeletions(cluster);
|
||||
cluster.triggerDeletionReports();
|
||||
|
||||
assertSafeMode(nn1, 4, 4);
|
||||
assertSafeMode(nn1, 4, 4, 3, 0);
|
||||
|
||||
// When we roll the edit log, the deletions will go through.
|
||||
banner("Waiting for standby to catch up to active namespace");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
|
||||
assertSafeMode(nn1, 0, 0);
|
||||
assertSafeMode(nn1, 0, 0, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -424,20 +424,21 @@ public void testBlocksDeletedInEditLog() throws Exception {
|
||||
restartActive();
|
||||
}
|
||||
|
||||
private static void assertSafeMode(NameNode nn, int safe, int total) {
|
||||
private static void assertSafeMode(NameNode nn, int safe, int total,
|
||||
int numNodes, int nodeThresh) {
|
||||
String status = nn.getNamesystem().getSafemode();
|
||||
if (safe == total) {
|
||||
assertTrue("Bad safemode status: '" + status + "'",
|
||||
status.startsWith(
|
||||
"Safe mode is ON." +
|
||||
"The reported blocks " + safe + " has reached the threshold " +
|
||||
"0.9990 of total blocks " + total + ". Safe mode will be " +
|
||||
"turned off automatically"));
|
||||
"Safe mode is ON. The reported blocks " + safe + " has reached the "
|
||||
+ "threshold 0.9990 of total blocks " + total + ". The number of "
|
||||
+ "live datanodes " + numNodes + " has reached the minimum number "
|
||||
+ nodeThresh + ". Safe mode will be turned off automatically"));
|
||||
} else {
|
||||
int additional = total - safe;
|
||||
assertTrue("Bad safemode status: '" + status + "'",
|
||||
status.startsWith(
|
||||
"Safe mode is ON." +
|
||||
"Safe mode is ON. " +
|
||||
"The reported blocks " + safe + " needs additional " +
|
||||
additional + " blocks"));
|
||||
}
|
||||
@ -467,14 +468,14 @@ public void testComplexFailoverIntoSafemode() throws Exception {
|
||||
|
||||
// We expect it to be on its way out of safemode, since all of the blocks
|
||||
// from the edit log have been reported.
|
||||
assertSafeMode(nn1, 3, 3);
|
||||
assertSafeMode(nn1, 3, 3, 3, 0);
|
||||
|
||||
// Initiate a failover into it while it's in safemode
|
||||
banner("Initiating a failover into NN1 in safemode");
|
||||
NameNodeAdapter.abortEditLogs(nn0);
|
||||
cluster.transitionToActive(1);
|
||||
|
||||
assertSafeMode(nn1, 5, 5);
|
||||
assertSafeMode(nn1, 5, 5, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -499,10 +500,11 @@ public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception
|
||||
// It will initially have all of the blocks necessary.
|
||||
String status = nn1.getNamesystem().getSafemode();
|
||||
assertTrue("Bad safemode status: '" + status + "'",
|
||||
status.startsWith(
|
||||
"Safe mode is ON." +
|
||||
"The reported blocks 10 has reached the threshold 0.9990 of " +
|
||||
"total blocks 10. Safe mode will be turned off automatically"));
|
||||
status.startsWith(
|
||||
"Safe mode is ON. The reported blocks 10 has reached the threshold "
|
||||
+ "0.9990 of total blocks 10. The number of live datanodes 3 has "
|
||||
+ "reached the minimum number 0. Safe mode will be turned off "
|
||||
+ "automatically"));
|
||||
|
||||
// Delete those blocks while the SBN is in safe mode.
|
||||
// Immediately roll the edit log before the actual deletions are sent
|
||||
@ -512,7 +514,7 @@ public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
|
||||
// Should see removal of the blocks as well as their contribution to safe block count.
|
||||
assertSafeMode(nn1, 0, 0);
|
||||
assertSafeMode(nn1, 0, 0, 3, 0);
|
||||
|
||||
|
||||
banner("Triggering sending deletions to DNs and Deletion Reports");
|
||||
@ -525,7 +527,7 @@ public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception
|
||||
// No change in assertion status here, but some of the consistency checks
|
||||
// in safemode will fire here if we accidentally decrement safe block count
|
||||
// below 0.
|
||||
assertSafeMode(nn1, 0, 0);
|
||||
assertSafeMode(nn1, 0, 0, 3, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -561,11 +563,11 @@ public void testSafeBlockTracking() throws Exception {
|
||||
|
||||
banner("Restarting SBN");
|
||||
restartStandby();
|
||||
assertSafeMode(nn1, 10, 10);
|
||||
assertSafeMode(nn1, 10, 10, 3, 0);
|
||||
|
||||
banner("Allowing SBN to catch up");
|
||||
HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
|
||||
assertSafeMode(nn1, 15, 15);
|
||||
assertSafeMode(nn1, 15, 15, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -593,7 +595,7 @@ public void testBlocksAddedWhileStandbyIsDown() throws Exception {
|
||||
nn0.getRpcServer().rollEditLog();
|
||||
|
||||
restartStandby();
|
||||
assertSafeMode(nn1, 6, 6);
|
||||
assertSafeMode(nn1, 6, 6, 3, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user