YARN-4738. Notify the RM about the status of OPPORTUNISTIC containers (Konstantinos Karanasos via asuresh)
commit f45bc5a83e
parent 7be53b65f9
@@ -47,6 +47,7 @@
 import org.apache.hadoop.util.VersionUtil;
 import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerState;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
@@ -61,6 +62,7 @@
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factories.impl.pb.RecordFactoryPBImpl;
 import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
+import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
 import org.apache.hadoop.yarn.server.api.ResourceTracker;
 import org.apache.hadoop.yarn.server.api.ServerRMProxy;
@@ -85,6 +87,7 @@
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
 import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
+import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.YarnVersionInfo;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -528,7 +531,12 @@ protected List<ContainerStatus> getContainerStatuses() throws IOException {
         containerStatuses.add(containerStatus);
       }
     }
+
+    // Account for all containers that got killed while they were still queued.
+    pendingCompletedContainers.putAll(getKilledQueuedContainerStatuses());
+
     containerStatuses.addAll(pendingCompletedContainers.values());
+
     if (LOG.isDebugEnabled()) {
       LOG.debug("Sending out " + containerStatuses.size()
           + " container statuses: " + containerStatuses);
@@ -536,6 +544,43 @@ protected List<ContainerStatus> getContainerStatuses() throws IOException {
     return containerStatuses;
   }
 
+  /**
+   * Add to the container statuses the status of the containers that got killed
+   * while they were queued.
+   */
+  private Map<ContainerId, ContainerStatus> getKilledQueuedContainerStatuses() {
+    Map<ContainerId, ContainerStatus> killedQueuedContainerStatuses =
+        new HashMap<>();
+    for (Map.Entry<ContainerTokenIdentifier, String> killedQueuedContainer :
+        this.context.getQueuingContext().
+            getKilledQueuedContainers().entrySet()) {
+      ContainerTokenIdentifier containerTokenId = killedQueuedContainer
+          .getKey();
+      ContainerId containerId = containerTokenId.getContainerID();
+      ContainerStatus containerStatus = BuilderUtils.newContainerStatus(
+          containerId, ContainerState.COMPLETE,
+          killedQueuedContainer.getValue(), ContainerExitStatus.ABORTED,
+          containerTokenId.getResource(), containerTokenId.getExecutionType());
+      ApplicationId applicationId = containerId.getApplicationAttemptId()
+          .getApplicationId();
+      if (isApplicationStopped(applicationId)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(applicationId + " is completing, " + " remove "
+              + containerId + " from NM context.");
+        }
+        this.context.getQueuingContext().getKilledQueuedContainers()
+            .remove(containerTokenId);
+        killedQueuedContainerStatuses.put(containerId, containerStatus);
+      } else {
+        if (!isContainerRecentlyStopped(containerId)) {
+          killedQueuedContainerStatuses.put(containerId, containerStatus);
+        }
+      }
+      addCompletedContainer(containerId);
+    }
+    return killedQueuedContainerStatuses;
+  }
+
   private List<ApplicationId> getRunningApplications() {
     List<ApplicationId> runningApplications = new ArrayList<ApplicationId>();
     runningApplications.addAll(this.context.getApplications().keySet());
@@ -601,6 +646,7 @@ public void addCompletedContainer(ContainerId containerId) {
   public void removeOrTrackCompletedContainersFromContext(
       List<ContainerId> containerIds) throws IOException {
     Set<ContainerId> removedContainers = new HashSet<ContainerId>();
+    Set<ContainerId> removedNullContainers = new HashSet<ContainerId>();
 
     pendingContainersToRemove.addAll(containerIds);
     Iterator<ContainerId> iter = pendingContainersToRemove.iterator();
@@ -610,6 +656,7 @@ public void removeOrTrackCompletedContainersFromContext(
       Container nmContainer = context.getContainers().get(containerId);
       if (nmContainer == null) {
         iter.remove();
+        removedNullContainers.add(containerId);
       } else if (nmContainer.getContainerState().equals(
           org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.DONE)) {
         context.getContainers().remove(containerId);
@@ -618,6 +665,17 @@ public void removeOrTrackCompletedContainersFromContext(
       }
     }
 
+    // Remove null containers from queuing context for killed queued containers.
+    Iterator<ContainerTokenIdentifier> killedQueuedContIter =
+        context.getQueuingContext().getKilledQueuedContainers().keySet().
+            iterator();
+    while (killedQueuedContIter.hasNext()) {
+      if (removedNullContainers.contains(
+          killedQueuedContIter.next().getContainerID())) {
+        killedQueuedContIter.remove();
+      }
+    }
+
     if (!removedContainers.isEmpty()) {
       LOG.info("Removed completed containers from NM context: "
           + removedContainers);
@@ -554,7 +554,7 @@ private void shedQueuedOpportunisticContainers() {
       if (containerTokenIdentifier != null) {
         this.context.getQueuingContext().getKilledQueuedContainers()
             .putIfAbsent(cInfo.getContainerTokenIdentifier(),
-                "Container De-queued to meet global queuing limits. "
+                "Container de-queued to meet NM queuing limits. "
                     + "Max Queue length["
                     + this.queuingLimit.getMaxQueueLength() + "]");
       }
@@ -65,6 +65,7 @@
 import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
 import org.apache.hadoop.yarn.api.records.ContainerState;
@@ -1077,6 +1078,126 @@ public org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
     Assert.assertTrue(containerIdSet.contains(runningContainerId));
   }
 
+  @Test(timeout = 90000)
+  public void testKilledQueuedContainers() throws Exception {
+    NodeManager nm = new NodeManager();
+    YarnConfiguration conf = new YarnConfiguration();
+    conf.set(
+        NodeStatusUpdaterImpl
+            .YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
+        "10000");
+    nm.init(conf);
+    NodeStatusUpdaterImpl nodeStatusUpdater =
+        (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
+    ApplicationId appId = ApplicationId.newInstance(0, 0);
+    ApplicationAttemptId appAttemptId =
+        ApplicationAttemptId.newInstance(appId, 0);
+
+    // Add application to context.
+    nm.getNMContext().getApplications().putIfAbsent(appId,
+        mock(Application.class));
+
+    // Create a running container and add it to the context.
+    ContainerId runningContainerId =
+        ContainerId.newContainerId(appAttemptId, 1);
+    Token runningContainerToken =
+        BuilderUtils.newContainerToken(runningContainerId, "anyHost",
+            1234, "anyUser", BuilderUtils.newResource(1024, 1), 0, 123,
+            "password".getBytes(), 0);
+    Container runningContainer =
+        new ContainerImpl(conf, null, null, null, null,
+            BuilderUtils.newContainerTokenIdentifier(runningContainerToken),
+            nm.getNMContext()) {
+          @Override
+          public ContainerState getCurrentState() {
+            return ContainerState.RUNNING;
+          }
+
+          @Override
+          public org.apache.hadoop.yarn.server.nodemanager.containermanager.
+              container.ContainerState getContainerState() {
+            return org.apache.hadoop.yarn.server.nodemanager.containermanager.
+                container.ContainerState.RUNNING;
+          }
+        };
+
+    nm.getNMContext().getContainers()
+        .put(runningContainerId, runningContainer);
+
+    // Create two killed queued containers and add them to the queuing context.
+    ContainerId killedQueuedContainerId1 = ContainerId.newContainerId(
+        appAttemptId, 2);
+    ContainerTokenIdentifier killedQueuedContainerTokenId1 = BuilderUtils
+        .newContainerTokenIdentifier(BuilderUtils.newContainerToken(
+            killedQueuedContainerId1, "anyHost", 1234, "anyUser", BuilderUtils
+                .newResource(1024, 1), 0, 123, "password".getBytes(), 0));
+    ContainerId killedQueuedContainerId2 = ContainerId.newContainerId(
+        appAttemptId, 3);
+    ContainerTokenIdentifier killedQueuedContainerTokenId2 = BuilderUtils
+        .newContainerTokenIdentifier(BuilderUtils.newContainerToken(
+            killedQueuedContainerId2, "anyHost", 1234, "anyUser", BuilderUtils
+                .newResource(1024, 1), 0, 123, "password".getBytes(), 0));
+
+    nm.getNMContext().getQueuingContext().getKilledQueuedContainers().put(
+        killedQueuedContainerTokenId1, "Queued container killed.");
+    nm.getNMContext().getQueuingContext().getKilledQueuedContainers().put(
+        killedQueuedContainerTokenId2, "Queued container killed.");
+
+    List<ContainerStatus> containerStatuses = nodeStatusUpdater
+        .getContainerStatuses();
+
+    Assert.assertEquals(3, containerStatuses.size());
+
+    ContainerStatus runningContainerStatus = null;
+    ContainerStatus killedQueuedContainerStatus1 = null;
+    ContainerStatus killedQueuedContainerStatus2 = null;
+    for (ContainerStatus cStatus : containerStatuses) {
+      if (ContainerState.RUNNING == cStatus.getState()) {
+        runningContainerStatus = cStatus;
+      }
+      if (ContainerState.COMPLETE == cStatus.getState()) {
+        if (killedQueuedContainerId1.equals(cStatus.getContainerId())) {
+          killedQueuedContainerStatus1 = cStatus;
+        } else {
+          killedQueuedContainerStatus2 = cStatus;
+        }
+      }
+    }
+
+    // Check container IDs and Container Status.
+    Assert.assertNotNull(runningContainerId);
+    Assert.assertNotNull(killedQueuedContainerId1);
+    Assert.assertNotNull(killedQueuedContainerId2);
+
+    // Killed queued container should have ABORTED exit status.
+    Assert.assertEquals(ContainerExitStatus.ABORTED,
+        killedQueuedContainerStatus1.getExitStatus());
+    Assert.assertEquals(ContainerExitStatus.ABORTED,
+        killedQueuedContainerStatus2.getExitStatus());
+
+    // Killed queued container should appear in the recentlyStoppedContainers.
+    Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(
+        killedQueuedContainerId1));
+    Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(
+        killedQueuedContainerId2));
+
+    // Check if killed queued containers are successfully removed from the
+    // queuing context.
+    List<ContainerId> ackedContainers = new ArrayList<ContainerId>();
+    ackedContainers.add(killedQueuedContainerId1);
+    ackedContainers.add(killedQueuedContainerId2);
+
+    nodeStatusUpdater.removeOrTrackCompletedContainersFromContext(
+        ackedContainers);
+
+    containerStatuses = nodeStatusUpdater.getContainerStatuses();
+
+    // Only the running container should be in the container statuses now.
+    Assert.assertEquals(1, containerStatuses.size());
+    Assert.assertEquals(ContainerState.RUNNING,
+        containerStatuses.get(0).getState());
+  }
+
   @Test(timeout = 10000)
   public void testCompletedContainersIsRecentlyStopped() throws Exception {
     NodeManager nm = new NodeManager();
@@ -46,6 +46,7 @@
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerState;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
+import org.apache.hadoop.yarn.api.records.ExecutionType;
 import org.apache.hadoop.yarn.api.records.NodeId;
 import org.apache.hadoop.yarn.api.records.NodeState;
 import org.apache.hadoop.yarn.api.records.Resource;
@@ -1341,21 +1342,28 @@ private void handleContainerStatus(List<ContainerStatus> containerStatuses) {
 
       // Process running containers
       if (remoteContainer.getState() == ContainerState.RUNNING) {
+        // Process only GUARANTEED containers in the RM.
+        if (remoteContainer.getExecutionType() == ExecutionType.GUARANTEED) {
         if (!launchedContainers.contains(containerId)) {
           // Just launched container. RM knows about it the first time.
           launchedContainers.add(containerId);
           newlyLaunchedContainers.add(remoteContainer);
           // Unregister from containerAllocationExpirer.
-          containerAllocationExpirer.unregister(
-              new AllocationExpirationInfo(containerId));
+          containerAllocationExpirer
+              .unregister(new AllocationExpirationInfo(containerId));
+        }
         }
       } else {
+        if (remoteContainer.getExecutionType() == ExecutionType.GUARANTEED) {
         // A finished container
         launchedContainers.remove(containerId);
-        completedContainers.add(remoteContainer);
         // Unregister from containerAllocationExpirer.
-        containerAllocationExpirer.unregister(
-            new AllocationExpirationInfo(containerId));
+        containerAllocationExpirer
+            .unregister(new AllocationExpirationInfo(containerId));
+        }
+        // Completed containers should also include the OPPORTUNISTIC containers
+        // so that the AM gets properly notified.
+        completedContainers.add(remoteContainer);
       }
     }
     if (newlyLaunchedContainers.size() != 0 || completedContainers.size() != 0) {