HDFS-16907. Add LastHeartbeatResponseTime for BP service actor (#5349)
Reviewed-by: Ayush Saxena <ayushsaxena@apache.org> Reviewed-by: Shilun Fan <slfan1989@apache.org> Signed-off-by: Tao Li <tomscut@apache.org>
This commit is contained in:
parent
a6a9fe17e0
commit
f02c452cf1
@ -207,6 +207,8 @@ Map<String, String> getActorInfoMap() {
|
|||||||
info.put("ActorState", getRunningState());
|
info.put("ActorState", getRunningState());
|
||||||
info.put("LastHeartbeat",
|
info.put("LastHeartbeat",
|
||||||
String.valueOf(getScheduler().getLastHearbeatTime()));
|
String.valueOf(getScheduler().getLastHearbeatTime()));
|
||||||
|
info.put("LastHeartbeatResponseTime",
|
||||||
|
String.valueOf(getScheduler().getLastHeartbeatResponseTime()));
|
||||||
info.put("LastBlockReport",
|
info.put("LastBlockReport",
|
||||||
String.valueOf(getScheduler().getLastBlockReportTime()));
|
String.valueOf(getScheduler().getLastBlockReportTime()));
|
||||||
info.put("maxBlockReportSize", String.valueOf(getMaxBlockReportSize()));
|
info.put("maxBlockReportSize", String.valueOf(getMaxBlockReportSize()));
|
||||||
@ -580,6 +582,8 @@ HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease)
|
|||||||
slowPeers,
|
slowPeers,
|
||||||
slowDisks);
|
slowDisks);
|
||||||
|
|
||||||
|
scheduler.updateLastHeartbeatResponseTime(monotonicNow());
|
||||||
|
|
||||||
if (outliersReportDue) {
|
if (outliersReportDue) {
|
||||||
// If the report was due and successfully sent, schedule the next one.
|
// If the report was due and successfully sent, schedule the next one.
|
||||||
scheduler.scheduleNextOutlierReport();
|
scheduler.scheduleNextOutlierReport();
|
||||||
@ -1202,6 +1206,9 @@ static class Scheduler {
|
|||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
volatile long lastHeartbeatTime = monotonicNow();
|
volatile long lastHeartbeatTime = monotonicNow();
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
private volatile long lastHeartbeatResponseTime = -1;
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
boolean resetBlockReportTime = true;
|
boolean resetBlockReportTime = true;
|
||||||
|
|
||||||
@ -1250,6 +1257,10 @@ void updateLastHeartbeatTime(long heartbeatTime) {
|
|||||||
lastHeartbeatTime = heartbeatTime;
|
lastHeartbeatTime = heartbeatTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void updateLastHeartbeatResponseTime(long heartbeatTime) {
|
||||||
|
this.lastHeartbeatResponseTime = heartbeatTime;
|
||||||
|
}
|
||||||
|
|
||||||
void updateLastBlockReportTime(long blockReportTime) {
|
void updateLastBlockReportTime(long blockReportTime) {
|
||||||
lastBlockReportTime = blockReportTime;
|
lastBlockReportTime = blockReportTime;
|
||||||
}
|
}
|
||||||
@ -1262,6 +1273,10 @@ long getLastHearbeatTime() {
|
|||||||
return (monotonicNow() - lastHeartbeatTime)/1000;
|
return (monotonicNow() - lastHeartbeatTime)/1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long getLastHeartbeatResponseTime() {
|
||||||
|
return (monotonicNow() - lastHeartbeatResponseTime) / 1000;
|
||||||
|
}
|
||||||
|
|
||||||
long getLastBlockReportTime() {
|
long getLastBlockReportTime() {
|
||||||
return (monotonicNow() - lastBlockReportTime)/1000;
|
return (monotonicNow() - lastBlockReportTime)/1000;
|
||||||
}
|
}
|
||||||
|
@ -3621,8 +3621,12 @@ public String getDatanodeHostname() {
|
|||||||
*/
|
*/
|
||||||
@Override // DataNodeMXBean
|
@Override // DataNodeMXBean
|
||||||
public String getBPServiceActorInfo() {
|
public String getBPServiceActorInfo() {
|
||||||
final ArrayList<Map<String, String>> infoArray =
|
return JSON.toString(getBPServiceActorInfoMap());
|
||||||
new ArrayList<Map<String, String>>();
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public List<Map<String, String>> getBPServiceActorInfoMap() {
|
||||||
|
final List<Map<String, String>> infoArray = new ArrayList<>();
|
||||||
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
|
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
|
||||||
if (bpos != null) {
|
if (bpos != null) {
|
||||||
for (BPServiceActor actor : bpos.getBPServiceActors()) {
|
for (BPServiceActor actor : bpos.getBPServiceActors()) {
|
||||||
@ -3630,7 +3634,7 @@ public String getBPServiceActorInfo() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return JSON.toString(infoArray);
|
return infoArray;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -3825,6 +3829,29 @@ boolean isRestarting() {
|
|||||||
* @return true - if the data node is fully started
|
* @return true - if the data node is fully started
|
||||||
*/
|
*/
|
||||||
public boolean isDatanodeFullyStarted() {
|
public boolean isDatanodeFullyStarted() {
|
||||||
|
return isDatanodeFullyStarted(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A datanode is considered to be fully started if all the BP threads are
|
||||||
|
* alive and all the block pools are initialized. If checkConnectionToActiveNamenode is true,
|
||||||
|
* the datanode is considered to be fully started if it is also heartbeating to
|
||||||
|
* active namenode in addition to the above-mentioned conditions.
|
||||||
|
*
|
||||||
|
* @param checkConnectionToActiveNamenode if true, performs additional check of whether datanode
|
||||||
|
* is heartbeating to active namenode.
|
||||||
|
* @return true if the datanode is fully started and also conditionally connected to active
|
||||||
|
* namenode, false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean isDatanodeFullyStarted(boolean checkConnectionToActiveNamenode) {
|
||||||
|
if (checkConnectionToActiveNamenode) {
|
||||||
|
for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
|
||||||
|
if (!bp.isInitialized() || !bp.isAlive() || bp.getActiveNN() == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
|
for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
|
||||||
if (!bp.isInitialized() || !bp.isAlive()) {
|
if (!bp.isInitialized() || !bp.isAlive()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -84,7 +84,8 @@
|
|||||||
<th>Namenode HA State</th>
|
<th>Namenode HA State</th>
|
||||||
<th>Block Pool ID</th>
|
<th>Block Pool ID</th>
|
||||||
<th>Actor State</th>
|
<th>Actor State</th>
|
||||||
<th>Last Heartbeat</th>
|
<th>Last Heartbeat Sent</th>
|
||||||
|
<th>Last Heartbeat Response</th>
|
||||||
<th>Last Block Report</th>
|
<th>Last Block Report</th>
|
||||||
<th>Last Block Report Size (Max Size)</th>
|
<th>Last Block Report Size (Max Size)</th>
|
||||||
</tr>
|
</tr>
|
||||||
@ -96,6 +97,7 @@
|
|||||||
<td>{BlockPoolID}</td>
|
<td>{BlockPoolID}</td>
|
||||||
<td>{ActorState}</td>
|
<td>{ActorState}</td>
|
||||||
<td>{LastHeartbeat}s</td>
|
<td>{LastHeartbeat}s</td>
|
||||||
|
<td>{LastHeartbeatResponseTime}s</td>
|
||||||
<td>{#helper_relative_time value="{LastBlockReport}"/}</td>
|
<td>{#helper_relative_time value="{LastBlockReport}"/}</td>
|
||||||
<td>{maxBlockReportSize|fmt_bytes} ({maxDataLength|fmt_bytes})</td>
|
<td>{maxBlockReportSize|fmt_bytes} ({maxDataLength|fmt_bytes})</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
@ -2529,6 +2529,24 @@ public boolean restartDataNode(DataNodeProperties dnprop) throws IOException {
|
|||||||
return restartDataNode(dnprop, false);
|
return restartDataNode(dnprop, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for the datanode to be fully functional i.e. all the BP service threads are alive,
|
||||||
|
* all block pools initiated and also connected to active namenode.
|
||||||
|
*
|
||||||
|
* @param dn Datanode instance.
|
||||||
|
* @param timeout Timeout in millis until when we should wait for datanode to be fully
|
||||||
|
* operational.
|
||||||
|
* @throws InterruptedException If the thread wait is interrupted.
|
||||||
|
* @throws TimeoutException If times out while awaiting the fully operational capability of
|
||||||
|
* datanode.
|
||||||
|
*/
|
||||||
|
public void waitDatanodeConnectedToActive(DataNode dn, int timeout)
|
||||||
|
throws InterruptedException, TimeoutException {
|
||||||
|
GenericTestUtils.waitFor(() -> dn.isDatanodeFullyStarted(true),
|
||||||
|
100, timeout, "Datanode is not connected to active namenode even after "
|
||||||
|
+ timeout + " ms of waiting");
|
||||||
|
}
|
||||||
|
|
||||||
public void waitDatanodeFullyStarted(DataNode dn, int timeout)
|
public void waitDatanodeFullyStarted(DataNode dn, int timeout)
|
||||||
throws TimeoutException, InterruptedException {
|
throws TimeoutException, InterruptedException {
|
||||||
GenericTestUtils.waitFor(dn::isDatanodeFullyStarted, 100, timeout,
|
GenericTestUtils.waitFor(dn::isDatanodeFullyStarted, 100, timeout,
|
||||||
|
@ -38,7 +38,9 @@
|
|||||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
import org.apache.hadoop.hdfs.DFSTestUtil;
|
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferTestCase;
|
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferTestCase;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
@ -294,4 +296,81 @@ public void testDataNodeMXBeanSlowDisksEnabled() throws Exception {
|
|||||||
if (cluster != null) {cluster.shutdown();}
|
if (cluster != null) {cluster.shutdown();}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDataNodeMXBeanLastHeartbeats() throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
try (MiniDFSCluster cluster = new MiniDFSCluster
|
||||||
|
.Builder(conf)
|
||||||
|
.nnTopology(MiniDFSNNTopology.simpleHATopology(2))
|
||||||
|
.build()) {
|
||||||
|
cluster.waitActive();
|
||||||
|
cluster.transitionToActive(0);
|
||||||
|
cluster.transitionToStandby(1);
|
||||||
|
|
||||||
|
DataNode datanode = cluster.getDataNodes().get(0);
|
||||||
|
|
||||||
|
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
|
||||||
|
ObjectName mxbeanName = new ObjectName(
|
||||||
|
"Hadoop:service=DataNode,name=DataNodeInfo");
|
||||||
|
|
||||||
|
// Verify and wait until one of the BP service actor identifies active namenode as active
|
||||||
|
// and another as standby.
|
||||||
|
cluster.waitDatanodeConnectedToActive(datanode, 5000);
|
||||||
|
|
||||||
|
// Verify that last heartbeat sent to both namenodes in last 5 sec.
|
||||||
|
assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
|
||||||
|
// Verify that last heartbeat response from both namenodes have been received within
|
||||||
|
// last 5 sec.
|
||||||
|
assertLastHeartbeatSentTime(datanode, "LastHeartbeatResponseTime");
|
||||||
|
|
||||||
|
|
||||||
|
NameNode sbNameNode = cluster.getNameNode(1);
|
||||||
|
|
||||||
|
// Stopping standby namenode
|
||||||
|
sbNameNode.stop();
|
||||||
|
|
||||||
|
// Verify that last heartbeat response time from one of the namenodes would stay much higher
|
||||||
|
// after stopping one namenode.
|
||||||
|
GenericTestUtils.waitFor(() -> {
|
||||||
|
List<Map<String, String>> bpServiceActorInfo = datanode.getBPServiceActorInfoMap();
|
||||||
|
Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
|
||||||
|
Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
|
||||||
|
|
||||||
|
long lastHeartbeatResponseTime1 =
|
||||||
|
Long.parseLong(bpServiceActorInfo1.get("LastHeartbeatResponseTime"));
|
||||||
|
long lastHeartbeatResponseTime2 =
|
||||||
|
Long.parseLong(bpServiceActorInfo2.get("LastHeartbeatResponseTime"));
|
||||||
|
|
||||||
|
LOG.info("Last heartbeat response from namenode 1: {}", lastHeartbeatResponseTime1);
|
||||||
|
LOG.info("Last heartbeat response from namenode 2: {}", lastHeartbeatResponseTime2);
|
||||||
|
|
||||||
|
return (lastHeartbeatResponseTime1 < 5L && lastHeartbeatResponseTime2 > 5L) || (
|
||||||
|
lastHeartbeatResponseTime1 > 5L && lastHeartbeatResponseTime2 < 5L);
|
||||||
|
|
||||||
|
}, 200, 15000,
|
||||||
|
"Last heartbeat response should be higher than 5s for at least one namenode");
|
||||||
|
|
||||||
|
// Verify that last heartbeat sent to both namenodes in last 5 sec even though
|
||||||
|
// the last heartbeat received from one of the namenodes is greater than 5 sec ago.
|
||||||
|
assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void assertLastHeartbeatSentTime(DataNode datanode, String lastHeartbeat) {
|
||||||
|
List<Map<String, String>> bpServiceActorInfo = datanode.getBPServiceActorInfoMap();
|
||||||
|
Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
|
||||||
|
Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
|
||||||
|
|
||||||
|
long lastHeartbeatSent1 =
|
||||||
|
Long.parseLong(bpServiceActorInfo1.get(lastHeartbeat));
|
||||||
|
long lastHeartbeatSent2 =
|
||||||
|
Long.parseLong(bpServiceActorInfo2.get(lastHeartbeat));
|
||||||
|
|
||||||
|
Assert.assertTrue(lastHeartbeat + " for first bp service actor is higher than 5s",
|
||||||
|
lastHeartbeatSent1 < 5L);
|
||||||
|
Assert.assertTrue(lastHeartbeat + " for second bp service actor is higher than 5s",
|
||||||
|
lastHeartbeatSent2 < 5L);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user