YARN-101. Fix NodeManager heartbeat processing to not lose track of completed containers in case of dropped heartbeats. Contributed by Xuan Gong.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1464105 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
38678cc3d9
commit
3e9200ddde
@ -183,6 +183,9 @@ Release 2.0.5-beta - UNRELEASED
|
||||
local directory hits unix file count limits and thus prevent job failures.
|
||||
(Omkar Vinit Joshi via vinodkv)
|
||||
|
||||
YARN-101. Fix NodeManager heartbeat processing to not lose track of completed
|
||||
containers in case of dropped heartbeats. (Xuan Gong via vinodkv)
|
||||
|
||||
Release 2.0.4-alpha - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -119,6 +119,10 @@ protected DeletionService createDeletionService(ContainerExecutor exec) {
|
||||
return new DeletionService(exec);
|
||||
}
|
||||
|
||||
protected NMContext createNMContext(NMContainerTokenSecretManager containerTokenSecretManager) {
|
||||
return new NMContext(containerTokenSecretManager);
|
||||
}
|
||||
|
||||
protected void doSecureLogin() throws IOException {
|
||||
SecurityUtil.login(getConfig(), YarnConfiguration.NM_KEYTAB,
|
||||
YarnConfiguration.NM_PRINCIPAL);
|
||||
@ -137,7 +141,7 @@ public void init(Configuration conf) {
|
||||
containerTokenSecretManager = new NMContainerTokenSecretManager(conf);
|
||||
}
|
||||
|
||||
this.context = new NMContext(containerTokenSecretManager);
|
||||
this.context = createNMContext(containerTokenSecretManager);
|
||||
|
||||
this.aclsManager = new ApplicationACLsManager(conf);
|
||||
|
||||
|
@ -88,6 +88,10 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
||||
private final NodeHealthCheckerService healthChecker;
|
||||
private final NodeManagerMetrics metrics;
|
||||
|
||||
private boolean previousHeartBeatSucceeded;
|
||||
private List<ContainerStatus> previousContainersStatuses =
|
||||
new ArrayList<ContainerStatus>();
|
||||
|
||||
public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
|
||||
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
|
||||
super(NodeStatusUpdaterImpl.class.getName());
|
||||
@ -95,6 +99,7 @@ public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
|
||||
this.context = context;
|
||||
this.dispatcher = dispatcher;
|
||||
this.metrics = metrics;
|
||||
this.previousHeartBeatSucceeded = true;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -314,8 +319,14 @@ private NodeStatus getNodeStatus() {
|
||||
NodeStatus nodeStatus = recordFactory.newRecordInstance(NodeStatus.class);
|
||||
nodeStatus.setNodeId(this.nodeId);
|
||||
|
||||
int numActiveContainers = 0;
|
||||
List<ContainerStatus> containersStatuses = new ArrayList<ContainerStatus>();
|
||||
if(previousHeartBeatSucceeded) {
|
||||
previousContainersStatuses.clear();
|
||||
} else {
|
||||
containersStatuses.addAll(previousContainersStatuses);
|
||||
}
|
||||
|
||||
int numActiveContainers = 0;
|
||||
for (Iterator<Entry<ContainerId, Container>> i =
|
||||
this.context.getContainers().entrySet().iterator(); i.hasNext();) {
|
||||
Entry<ContainerId, Container> e = i.next();
|
||||
@ -330,6 +341,7 @@ private NodeStatus getNodeStatus() {
|
||||
LOG.info("Sending out status for container: " + containerStatus);
|
||||
|
||||
if (containerStatus.getState() == ContainerState.COMPLETE) {
|
||||
previousContainersStatuses.add(containerStatus);
|
||||
// Remove
|
||||
i.remove();
|
||||
|
||||
@ -404,6 +416,7 @@ public void run() {
|
||||
}
|
||||
NodeHeartbeatResponse response =
|
||||
resourceTracker.nodeHeartbeat(request);
|
||||
previousHeartBeatSucceeded = true;
|
||||
//get next heartbeat interval from response
|
||||
nextHeartBeatInterval = response.getNextHeartBeatInterval();
|
||||
// See if the master-key has rolled over
|
||||
@ -449,6 +462,7 @@ public void run() {
|
||||
new CMgrCompletedAppsEvent(appsToCleanup));
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
previousHeartBeatSucceeded = false;
|
||||
// TODO Better error handling. Thread can die with the rest of the
|
||||
// NM still running.
|
||||
LOG.error("Caught exception in status-updater", e);
|
||||
|
@ -19,6 +19,7 @@
|
||||
package org.apache.hadoop.yarn.server.nodemanager;
|
||||
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
@ -29,6 +30,7 @@
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
@ -43,6 +45,7 @@
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
@ -58,11 +61,13 @@
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeAction;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
|
||||
import org.apache.hadoop.yarn.service.Service;
|
||||
@ -92,6 +97,8 @@ public class TestNodeStatusUpdater {
|
||||
private final Configuration conf = createNMConfig();
|
||||
private NodeManager nm;
|
||||
protected NodeManager rebootedNodeManager;
|
||||
private boolean containerStatusBackupSuccessfully = true;
|
||||
private List<ContainerStatus> completedContainerStatusList = new ArrayList<ContainerStatus>();
|
||||
|
||||
@After
|
||||
public void tearDown() {
|
||||
@ -237,6 +244,22 @@ protected ResourceTracker getRMClient() {
|
||||
}
|
||||
}
|
||||
|
||||
private class MyNodeStatusUpdater2 extends NodeStatusUpdaterImpl {
|
||||
public ResourceTracker resourceTracker;
|
||||
|
||||
public MyNodeStatusUpdater2(Context context, Dispatcher dispatcher,
|
||||
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
|
||||
super(context, dispatcher, healthChecker, metrics);
|
||||
resourceTracker = new MyResourceTracker4(context);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ResourceTracker getRMClient() {
|
||||
return resourceTracker;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private class MyNodeStatusUpdater3 extends NodeStatusUpdaterImpl {
|
||||
public ResourceTracker resourceTracker;
|
||||
private Context context;
|
||||
@ -384,6 +407,104 @@ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
|
||||
}
|
||||
}
|
||||
|
||||
private class MyResourceTracker4 implements ResourceTracker {
|
||||
|
||||
public NodeAction registerNodeAction = NodeAction.NORMAL;
|
||||
public NodeAction heartBeatNodeAction = NodeAction.NORMAL;
|
||||
private Context context;
|
||||
|
||||
public MyResourceTracker4(Context context) {
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RegisterNodeManagerResponse registerNodeManager(
|
||||
RegisterNodeManagerRequest request) throws YarnRemoteException {
|
||||
RegisterNodeManagerResponse response = recordFactory
|
||||
.newRecordInstance(RegisterNodeManagerResponse.class);
|
||||
response.setNodeAction(registerNodeAction);
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
|
||||
throws YarnRemoteException {
|
||||
try {
|
||||
if (heartBeatID == 0) {
|
||||
Assert.assertEquals(request.getNodeStatus().getContainersStatuses()
|
||||
.size(), 0);
|
||||
Assert.assertEquals(context.getContainers().size(), 0);
|
||||
} else if (heartBeatID == 1) {
|
||||
Assert.assertEquals(request.getNodeStatus().getContainersStatuses()
|
||||
.size(), 5);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(0).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(0)
|
||||
.getContainerId().getId() == 1);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(1).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(1)
|
||||
.getContainerId().getId() == 2);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(2).getState() == ContainerState.COMPLETE
|
||||
&& request.getNodeStatus().getContainersStatuses().get(2)
|
||||
.getContainerId().getId() == 3);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(3).getState() == ContainerState.COMPLETE
|
||||
&& request.getNodeStatus().getContainersStatuses().get(3)
|
||||
.getContainerId().getId() == 4);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(4).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(4)
|
||||
.getContainerId().getId() == 5);
|
||||
throw new YarnException("Lost the heartbeat response");
|
||||
} else if (heartBeatID == 2) {
|
||||
Assert.assertEquals(request.getNodeStatus().getContainersStatuses()
|
||||
.size(), 7);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(0).getState() == ContainerState.COMPLETE
|
||||
&& request.getNodeStatus().getContainersStatuses().get(0)
|
||||
.getContainerId().getId() == 3);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(1).getState() == ContainerState.COMPLETE
|
||||
&& request.getNodeStatus().getContainersStatuses().get(1)
|
||||
.getContainerId().getId() == 4);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(2).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(2)
|
||||
.getContainerId().getId() == 1);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(3).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(3)
|
||||
.getContainerId().getId() == 2);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(4).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(4)
|
||||
.getContainerId().getId() == 5);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(5).getState() == ContainerState.RUNNING
|
||||
&& request.getNodeStatus().getContainersStatuses().get(5)
|
||||
.getContainerId().getId() == 6);
|
||||
Assert.assertTrue(request.getNodeStatus().getContainersStatuses()
|
||||
.get(6).getState() == ContainerState.COMPLETE
|
||||
&& request.getNodeStatus().getContainersStatuses().get(6)
|
||||
.getContainerId().getId() == 7);
|
||||
}
|
||||
} catch (AssertionError error) {
|
||||
LOG.info(error);
|
||||
containerStatusBackupSuccessfully = false;
|
||||
} finally {
|
||||
heartBeatID++;
|
||||
}
|
||||
NodeStatus nodeStatus = request.getNodeStatus();
|
||||
nodeStatus.setResponseId(heartBeatID);
|
||||
NodeHeartbeatResponse nhResponse =
|
||||
YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID,
|
||||
heartBeatNodeAction, null, null, null, 1000L);
|
||||
return nhResponse;
|
||||
}
|
||||
}
|
||||
|
||||
@Before
|
||||
public void clearError() {
|
||||
nmStartError = null;
|
||||
@ -725,6 +846,127 @@ public void testApplicationKeepAlive() throws Exception {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test completed containerStatus get back up when heart beat lost
|
||||
*/
|
||||
@Test(timeout = 20000)
|
||||
public void testCompletedContainerStatusBackup() throws Exception {
|
||||
nm = new NodeManager() {
|
||||
@Override
|
||||
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
|
||||
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
|
||||
MyNodeStatusUpdater2 myNodeStatusUpdater =
|
||||
new MyNodeStatusUpdater2(context, dispatcher, healthChecker,
|
||||
metrics);
|
||||
return myNodeStatusUpdater;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected NMContext createNMContext(
|
||||
NMContainerTokenSecretManager containerTokenSecretManager) {
|
||||
return new MyNMContext(containerTokenSecretManager);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
YarnConfiguration conf = createNMConfig();
|
||||
nm.init(conf);
|
||||
nm.start();
|
||||
|
||||
int waitCount = 0;
|
||||
while (heartBeatID <= 3 && waitCount++ != 20) {
|
||||
Thread.sleep(500);
|
||||
}
|
||||
if(!containerStatusBackupSuccessfully) {
|
||||
Assert.fail("ContainerStatus Backup failed");
|
||||
}
|
||||
nm.stop();
|
||||
}
|
||||
|
||||
private class MyNMContext extends NMContext {
|
||||
ConcurrentMap<ContainerId, Container> containers =
|
||||
new ConcurrentSkipListMap<ContainerId, Container>();
|
||||
|
||||
public MyNMContext(NMContainerTokenSecretManager
|
||||
containerTokenSecretManager) {
|
||||
super(containerTokenSecretManager);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ConcurrentMap<ContainerId, Container> getContainers() {
|
||||
if (heartBeatID == 0) {
|
||||
return containers;
|
||||
} else if (heartBeatID == 1) {
|
||||
ContainerStatus containerStatus1 =
|
||||
createContainerStatus(1, ContainerState.RUNNING);
|
||||
Container container1 = getMockContainer(containerStatus1);
|
||||
containers.put(containerStatus1.getContainerId(), container1);
|
||||
|
||||
ContainerStatus containerStatus2 =
|
||||
createContainerStatus(2, ContainerState.RUNNING);
|
||||
Container container2 = getMockContainer(containerStatus2);
|
||||
containers.put(containerStatus2.getContainerId(), container2);
|
||||
|
||||
ContainerStatus containerStatus3 =
|
||||
createContainerStatus(3, ContainerState.COMPLETE);
|
||||
Container container3 = getMockContainer(containerStatus3);
|
||||
containers.put(containerStatus3.getContainerId(), container3);
|
||||
completedContainerStatusList.add(containerStatus3);
|
||||
|
||||
ContainerStatus containerStatus4 =
|
||||
createContainerStatus(4, ContainerState.COMPLETE);
|
||||
Container container4 = getMockContainer(containerStatus4);
|
||||
containers.put(containerStatus4.getContainerId(), container4);
|
||||
completedContainerStatusList.add(containerStatus4);
|
||||
|
||||
ContainerStatus containerStatus5 =
|
||||
createContainerStatus(5, ContainerState.RUNNING);
|
||||
Container container5 = getMockContainer(containerStatus5);
|
||||
containers.put(containerStatus5.getContainerId(), container5);
|
||||
|
||||
return containers;
|
||||
} else if (heartBeatID == 2) {
|
||||
ContainerStatus containerStatus6 =
|
||||
createContainerStatus(6, ContainerState.RUNNING);
|
||||
Container container6 = getMockContainer(containerStatus6);
|
||||
containers.put(containerStatus6.getContainerId(), container6);
|
||||
|
||||
ContainerStatus containerStatus7 =
|
||||
createContainerStatus(7, ContainerState.COMPLETE);
|
||||
Container container7 = getMockContainer(containerStatus7);
|
||||
containers.put(containerStatus7.getContainerId(), container7);
|
||||
completedContainerStatusList.add(containerStatus7);
|
||||
|
||||
return containers;
|
||||
} else {
|
||||
containers.clear();
|
||||
|
||||
return containers;
|
||||
}
|
||||
}
|
||||
|
||||
private ContainerStatus createContainerStatus(int id,
|
||||
ContainerState containerState) {
|
||||
ApplicationId applicationId =
|
||||
BuilderUtils.newApplicationId(System.currentTimeMillis(), id);
|
||||
ApplicationAttemptId applicationAttemptId =
|
||||
BuilderUtils.newApplicationAttemptId(applicationId, id);
|
||||
ContainerId contaierId =
|
||||
BuilderUtils.newContainerId(applicationAttemptId, id);
|
||||
ContainerStatus containerStatus =
|
||||
BuilderUtils.newContainerStatus(contaierId, containerState,
|
||||
"test_containerStatus: id=" + id + ", containerState: "
|
||||
+ containerState, 0);
|
||||
return containerStatus;
|
||||
}
|
||||
|
||||
private Container getMockContainer(ContainerStatus containerStatus) {
|
||||
Container container = mock(Container.class);
|
||||
when(container.cloneAndGetContainerStatus()).thenReturn(containerStatus);
|
||||
return container;
|
||||
}
|
||||
}
|
||||
|
||||
private void verifyNodeStartFailure(String errMessage) {
|
||||
YarnConfiguration conf = createNMConfig();
|
||||
nm.init(conf);
|
||||
|
Loading…
Reference in New Issue
Block a user