YARN-3445. Cache runningApps in RMNode for getting running apps on given NodeId. (Junping Du via mingma)

This commit is contained in:
Ming Ma 2015-07-10 08:30:10 -07:00
parent b48908033f
commit 08244264c0
7 changed files with 91 additions and 11 deletions

View File

@ -62,7 +62,8 @@ private static class FakeRMNodeImpl implements RMNode {
private NodeState state;
private List<ContainerId> toCleanUpContainers;
private List<ApplicationId> toCleanUpApplications;
private List<ApplicationId> runningApplications;
public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress,
Resource perNode, String rackName, String healthReport,
int cmdPort, String hostName, NodeState state) {
@ -77,6 +78,7 @@ public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress,
this.state = state;
toCleanUpApplications = new ArrayList<ApplicationId>();
toCleanUpContainers = new ArrayList<ContainerId>();
runningApplications = new ArrayList<ApplicationId>();
}
public NodeId getNodeID() {
@ -135,6 +137,10 @@ public List<ApplicationId> getAppsToCleanup() {
return toCleanUpApplications;
}
public List<ApplicationId> getRunningApps() {
return runningApplications;
}
public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse response) {
}

View File

@ -118,6 +118,11 @@ public List<ApplicationId> getAppsToCleanup() {
return node.getAppsToCleanup();
}
@Override
public List<ApplicationId> getRunningApps() {
return node.getRunningApps();
}
@Override
public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse nodeHeartbeatResponse) {

View File

@ -1678,6 +1678,9 @@ Release 2.6.0 - 2014-11-18
YARN-2811. In Fair Scheduler, reservation fulfillments shouldn't ignore max
share (Siqi Li via Sandy Ryza)
YARN-3445. Cache runningApps in RMNode for getting running apps on given
NodeId. (Junping Du via mingma)
IMPROVEMENTS
YARN-2197. Add a link to YARN CHANGES.txt in the left side of doc

View File

@ -119,6 +119,8 @@ public interface RMNode {
public List<ApplicationId> getAppsToCleanup();
List<ApplicationId> getRunningApps();
/**
* Update a {@link NodeHeartbeatResponse} with the list of containers and
* applications to clean up for this node.

View File

@ -123,11 +123,16 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
new HashSet<ContainerId>();
/* the list of applications that have finished and need to be purged */
private final List<ApplicationId> finishedApplications = new ArrayList<ApplicationId>();
private final List<ApplicationId> finishedApplications =
new ArrayList<ApplicationId>();
/* the list of applications that are running on this node */
private final List<ApplicationId> runningApplications =
new ArrayList<ApplicationId>();
private NodeHeartbeatResponse latestNodeHeartBeatResponse = recordFactory
.newRecordInstance(NodeHeartbeatResponse.class);
private static final StateMachineFactory<RMNodeImpl,
NodeState,
RMNodeEventType,
@ -136,7 +141,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
NodeState,
RMNodeEventType,
RMNodeEvent>(NodeState.NEW)
//Transitions from NEW state
.addTransition(NodeState.NEW, NodeState.RUNNING,
RMNodeEventType.STARTED, new AddNodeTransition())
@ -382,6 +387,16 @@ public List<ApplicationId> getAppsToCleanup() {
}
@Override
public List<ApplicationId> getRunningApps() {
this.readLock.lock();
try {
return new ArrayList<ApplicationId>(this.runningApplications);
} finally {
this.readLock.unlock();
}
}
@Override
public List<ContainerId> getContainersToCleanUp() {
@ -519,9 +534,12 @@ private static void handleRunningAppOnNode(RMNodeImpl rmNode,
LOG.warn("Cannot get RMApp by appId=" + appId
+ ", just added it to finishedApplications list for cleanup");
rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
return;
}
// Add running applications back due to Node add or Node reconnection.
rmNode.runningApplications.add(appId);
context.getDispatcher().getEventHandler()
.handle(new RMAppRunningOnNodeEvent(appId, nodeId));
}
@ -707,8 +725,9 @@ public static class CleanUpAppTransition
@Override
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
rmNode.finishedApplications.add(((
RMNodeCleanAppEvent) event).getAppId());
ApplicationId appId = ((RMNodeCleanAppEvent) event).getAppId();
rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
}
}
@ -910,12 +929,22 @@ private void handleContainerStatus(List<ContainerStatus> containerStatuses) {
+ "cleanup, no further processing");
continue;
}
if (finishedApplications.contains(containerId.getApplicationAttemptId()
.getApplicationId())) {
ApplicationId containerAppId =
containerId.getApplicationAttemptId().getApplicationId();
if (finishedApplications.contains(containerAppId)) {
LOG.info("Container " + containerId
+ " belongs to an application that is already killed,"
+ " no further processing");
continue;
} else if (!runningApplications.contains(containerAppId)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container " + containerId
+ " is the first container get launched for application "
+ containerAppId);
}
runningApplications.add(containerAppId);
}
// Process running containers

View File

@ -186,6 +186,11 @@ public List<ApplicationId> getAppsToCleanup() {
return null;
}
@Override
public List<ApplicationId> getRunningApps() {
return null;
}
@Override
public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) {
}

View File

@ -33,6 +33,7 @@
import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeState;
@ -485,9 +486,9 @@ public void testUpdateHeartbeatResponseForCleanup() {
NodeId nodeId = node.getNodeID();
// Expire a container
ContainerId completedContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId(
BuilderUtils.newApplicationId(0, 0), 0), 0);
ContainerId completedContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId(
BuilderUtils.newApplicationId(0, 0), 0), 0);
node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId));
Assert.assertEquals(1, node.getContainersToCleanUp().size());
@ -512,6 +513,35 @@ public void testUpdateHeartbeatResponseForCleanup() {
Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0));
}
@Test(timeout=20000)
public void testUpdateHeartbeatResponseForAppLifeCycle() {
RMNodeImpl node = getRunningNode();
NodeId nodeId = node.getNodeID();
ApplicationId runningAppId = BuilderUtils.newApplicationId(0, 1);
// Create a running container
ContainerId runningContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId(
runningAppId, 0), 0);
ContainerStatus status = ContainerStatus.newInstance(runningContainerId,
ContainerState.RUNNING, "", 0);
List<ContainerStatus> statusList = new ArrayList<ContainerStatus>();
statusList.add(status);
NodeHealthStatus nodeHealth = NodeHealthStatus.newInstance(true,
"", System.currentTimeMillis());
node.handle(new RMNodeStatusEvent(nodeId, nodeHealth,
statusList, null, null));
Assert.assertEquals(1, node.getRunningApps().size());
// Finish an application
ApplicationId finishedAppId = runningAppId;
node.handle(new RMNodeCleanAppEvent(nodeId, finishedAppId));
Assert.assertEquals(1, node.getAppsToCleanup().size());
Assert.assertEquals(0, node.getRunningApps().size());
}
private RMNodeImpl getRunningNode() {
return getRunningNode(null, 0);
}