HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)

Change-Id: Ib8420310e515bb98091de86ea5c4be354878d43c
This commit is contained in:
Colin Patrick Mccabe 2015-09-25 15:25:42 -07:00
parent d3c49e7662
commit 4e7c6a653f
2 changed files with 55 additions and 2 deletions

View File

@ -24,6 +24,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
@ -34,10 +35,13 @@
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StopWatch;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* Manage the heartbeats received from datanodes.
* The datanode list and statistics are synchronized
@ -62,7 +66,7 @@ class HeartbeatManager implements DatanodeStatistics {
private final long heartbeatRecheckInterval;
/** Heartbeat monitor thread */
private final Daemon heartbeatThread = new Daemon(new Monitor());
private final StopWatch heartbeatStopWatch = new StopWatch();
final Namesystem namesystem;
final BlockManager blockManager;
@ -261,6 +265,17 @@ synchronized void stopDecommission(final DatanodeDescriptor node) {
}
}
@VisibleForTesting
void restartHeartbeatStopWatch() {
heartbeatStopWatch.reset().start();
}
@VisibleForTesting
boolean shouldAbortHeartbeatCheck(long offset) {
long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
return elapsed + offset > heartbeatRecheckInterval;
}
/**
* Check if there are any expired heartbeats, and if so,
* whether any blocks have to be re-replicated.
@ -307,6 +322,10 @@ void heartbeatCheck() {
int numOfStaleStorages = 0;
synchronized(this) {
for (DatanodeDescriptor d : datanodes) {
// check if an excessive GC pause has occurred
if (shouldAbortHeartbeatCheck(0)) {
return;
}
if (dead == null && dm.isDatanodeDead(d)) {
stats.incrExpiredHeartbeats();
dead = d;
@ -375,6 +394,7 @@ private class Monitor implements Runnable {
@Override
public void run() {
while(namesystem.isRunning()) {
restartHeartbeatStopWatch();
try {
final long now = Time.monotonicNow();
if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
@ -396,6 +416,12 @@ public void run() {
Thread.sleep(5000); // 5 seconds
} catch (InterruptedException ie) {
}
// avoid declaring nodes dead for another cycle if a GC pause lasts
// longer than the node recheck interval
if (shouldAbortHeartbeatCheck(-5000)) {
LOG.warn("Skipping next heartbeat scan due to excessive pause");
lastHeartbeatCheck = Time.monotonicNow();
}
}
}
}

View File

@ -18,6 +18,8 @@
package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
@ -33,6 +35,7 @@
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
@ -40,6 +43,7 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.junit.Test;
import org.mockito.Mockito;
/**
* Test if FSNamesystem handles heartbeat right
@ -243,4 +247,27 @@ public void testHeartbeatBlockRecovery() throws Exception {
cluster.shutdown();
}
}
@Test
public void testHeartbeatStopWatch() throws Exception {
Namesystem ns = Mockito.mock(Namesystem.class);
BlockManager bm = Mockito.mock(BlockManager.class);
Configuration conf = new Configuration();
long recheck = 2000;
conf.setLong(
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, recheck);
HeartbeatManager monitor = new HeartbeatManager(ns, bm, conf);
monitor.restartHeartbeatStopWatch();
assertFalse(monitor.shouldAbortHeartbeatCheck(0));
// sleep shorter than recheck and verify shouldn't abort
Thread.sleep(100);
assertFalse(monitor.shouldAbortHeartbeatCheck(0));
// sleep longer than recheck and verify should abort unless ignore delay
Thread.sleep(recheck);
assertTrue(monitor.shouldAbortHeartbeatCheck(0));
assertFalse(monitor.shouldAbortHeartbeatCheck(-recheck*3));
// ensure it resets properly
monitor.restartHeartbeatStopWatch();
assertFalse(monitor.shouldAbortHeartbeatCheck(0));
}
}