From feaf0349949e831ce3f25814c1bbff52f17bfe8f Mon Sep 17 00:00:00 2001 From: Rohith Sharma K S Date: Mon, 24 Aug 2015 11:25:07 +0530 Subject: [PATCH] YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id has not been reset synchronously. (Jun Gong via rohithsharmaks) --- .../hadoop/yarn/sls/nodemanager/NodeInfo.java | 3 ++ .../yarn/sls/scheduler/RMNodeWrapper.java | 5 +++ hadoop-yarn-project/CHANGES.txt | 3 ++ .../ResourceTrackerService.java | 2 + .../server/resourcemanager/rmnode/RMNode.java | 7 +++- .../resourcemanager/rmnode/RMNodeImpl.java | 15 ++++--- .../server/resourcemanager/MockNodes.java | 4 ++ .../resourcetracker/TestNMReconnect.java | 39 +++++++++++++++++++ 8 files changed, 72 insertions(+), 6 deletions(-) diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java index 440779c3ab..2d2c3e03cf 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java @@ -149,6 +149,9 @@ public NodeHeartbeatResponse getLastNodeHeartBeatResponse() { return null; } + public void resetLastNodeHeartBeatResponse() { + } + public List pullContainerUpdates() { ArrayList list = new ArrayList(); diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java index a6633aea9a..ecc4734986 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java @@ -134,6 +134,11 @@ public NodeHeartbeatResponse getLastNodeHeartBeatResponse() { return node.getLastNodeHeartBeatResponse(); } + @Override + public void resetLastNodeHeartBeatResponse() { + node.getLastNodeHeartBeatResponse().setResponseId(0); + } + @Override @SuppressWarnings("unchecked") public List pullContainerUpdates() { diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 5904a313e9..bf58c96886 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -792,6 +792,9 @@ Release 2.8.0 - UNRELEASED YARN-3986. getTransferredContainers in AbstractYarnScheduler should be present in YarnScheduler interface instead. (Varun Saxena via rohithsharmaks) + YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id + has not been reset synchronously. (Jun Gong via rohithsharmaks) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index 3c2c09b8c1..100e991621 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -325,6 +325,8 @@ public RegisterNodeManagerResponse registerNodeManager( } else { LOG.info("Reconnect from the node at: " + host); this.nmLivelinessMonitor.unregister(nodeId); + // Reset heartbeat ID since node just restarted. + oldNode.resetLastNodeHeartBeatResponse(); this.rmContext .getDispatcher() .getEventHandler() diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java index 0386be6ce0..00cd3b6db4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java @@ -129,7 +129,12 @@ public interface RMNode { public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response); public NodeHeartbeatResponse getLastNodeHeartBeatResponse(); - + + /** + * Reset lastNodeHeartbeatResponse's ID to 0. + */ + void resetLastNodeHeartBeatResponse(); + /** * Get and clear the list of containerUpdates accumulated across NM * heartbeats. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index f182d02e6f..7a1ba74191 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -443,6 +443,16 @@ public NodeHeartbeatResponse getLastNodeHeartBeatResponse() { } } + @Override + public void resetLastNodeHeartBeatResponse() { + this.writeLock.lock(); + try { + latestNodeHeartBeatResponse.setResponseId(0); + } finally { + this.writeLock.unlock(); + } + } + public void handle(RMNodeEvent event) { LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType()); try { @@ -617,8 +627,6 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) { new NodeRemovedSchedulerEvent(rmNode)); if (rmNode.getHttpPort() == newNode.getHttpPort()) { - // Reset heartbeat ID since node just restarted. - rmNode.getLastNodeHeartBeatResponse().setResponseId(0); if (!rmNode.getTotalCapability().equals( newNode.getTotalCapability())) { rmNode.totalCapability = newNode.getTotalCapability(); @@ -656,9 +664,6 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) { handleNMContainerStatus(reconnectEvent.getNMContainerStatuses(), rmNode); - // Reset heartbeat ID since node just restarted. - rmNode.getLastNodeHeartBeatResponse().setResponseId(0); - for (ApplicationId appId : reconnectEvent.getRunningApplications()) { handleRunningAppOnNode(rmNode, rmNode.context, appId, rmNode.nodeId); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java index 095fe28e1e..53cb8d097c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java @@ -200,6 +200,10 @@ public NodeHeartbeatResponse getLastNodeHeartBeatResponse() { return null; } + @Override + public void resetLastNodeHeartBeatResponse() { + } + @Override public String getNodeManagerVersion() { return null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java index b525efced6..dce3d06c26 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java @@ -21,6 +21,11 @@ import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.event.DrainDispatcher; +import org.apache.hadoop.yarn.server.resourcemanager.MockNM; +import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.NodeId; @@ -189,4 +194,38 @@ public void testCompareRMNodeAfterReconnect() throws Exception { nlm.stop(); scheduler.stop(); } + + @Test(timeout = 10000) + public void testRMNodeStatusAfterReconnect() throws Exception { + // The node(127.0.0.1:1234) reconnected with RM. When it registered with + // RM, RM set its lastNodeHeartbeatResponse's id to 0 asynchronously. But + // the node's heartbeat come before RM succeeded setting the id to 0. + final DrainDispatcher dispatcher = new DrainDispatcher(); + MockRM rm = new MockRM(){ + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; + rm.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm1.registerNode(); + int i = 0; + while(i < 3) { + nm1.nodeHeartbeat(true); + dispatcher.await(); + i++; + } + + MockNM nm2 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm2.registerNode(); + RMNode rmNode = rm.getRMContext().getRMNodes().get(nm2.getNodeId()); + nm2.nodeHeartbeat(true); + dispatcher.await(); + Assert.assertEquals("Node is Not in Running state.", NodeState.RUNNING, + rmNode.getState()); + rm.stop(); + } }