YARN-7371. Added allocateRequestId in NMContainerStatus for recovery. Contributed by Chandni Singh

This commit is contained in:
Jian He 2017-11-04 23:15:21 -07:00
parent a127f7b0fb
commit a55d0738f1
16 changed files with 177 additions and 61 deletions

View File

@ -131,12 +131,16 @@ protected FileSystem getFS() {
return fs;
}
protected MiniYARNCluster getYarnCluster() {
return yarnCluster;
}
protected void setupInternal(int numNodeManager)
throws Exception {
LOG.info("Starting up YARN cluster");
// Logger rootLogger = LogManager.getRootLogger();
// rootLogger.setLevel(Level.DEBUG);
if (conf == null) {
setConf(new YarnConfiguration());
}
conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128);
// reduce the teardown waiting time
conf.setLong(YarnConfiguration.DISPATCHER_DRAIN_EVENTS_TIMEOUT, 1000);

View File

@ -18,12 +18,13 @@
package org.apache.hadoop.yarn.service;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.api.records.Component;
@ -41,17 +42,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.*;
import java.util.concurrent.TimeoutException;
import static org.apache.hadoop.yarn.api.records.YarnApplicationState.FINISHED;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.*;
/**
* End to end tests to test deploying services with MiniYarnCluster and a in-JVM
@ -67,7 +63,8 @@ public class TestYarnNativeServices extends ServiceTestUtils {
@Before
public void setup() throws Exception {
setupInternal(NUM_NMS);
File tmpYarnDir = new File("target", "tmp");
FileUtils.deleteQuietly(tmpYarnDir);
}
@After
@ -84,6 +81,7 @@ public void tearDown() throws IOException {
// 6. Destroy the service
@Test (timeout = 200000)
public void testCreateFlexStopDestroyService() throws Exception {
setupInternal(NUM_NMS);
ServiceClient client = createClient();
Service exampleApp = createExampleApplication();
client.actionCreate(exampleApp);
@ -135,6 +133,7 @@ public void testCreateFlexStopDestroyService() throws Exception {
// Check containers for compa started before containers for compb
@Test (timeout = 200000)
public void testComponentStartOrder() throws Exception {
setupInternal(NUM_NMS);
ServiceClient client = createClient();
Service exampleApp = new Service();
exampleApp.setName("teststartorder");
@ -155,6 +154,70 @@ public void testComponentStartOrder() throws Exception {
client.actionDestroy(exampleApp.getName());
}
// Test to verify recovery of SeviceMaster after RM is restarted.
// 1. Create an example service.
// 2. Restart RM.
// 3. Fail the application attempt.
// 4. Verify ServiceMaster recovers.
@Test(timeout = 200000)
public void testRecoverComponentsAfterRMRestart() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(
YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
conf.setLong(YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
500L);
conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS, true);
conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_USE_RPC, true);
setConf(conf);
setupInternal(NUM_NMS);
ServiceClient client = createClient();
Service exampleApp = createExampleApplication();
client.actionCreate(exampleApp);
waitForAllCompToBeReady(client, exampleApp);
LOG.info("Restart the resource manager");
getYarnCluster().restartResourceManager(
getYarnCluster().getActiveRMIndex());
GenericTestUtils.waitFor(() ->
getYarnCluster().getResourceManager().getServiceState() ==
org.apache.hadoop.service.Service.STATE.STARTED, 2000, 200000);
Assert.assertTrue("node managers connected",
getYarnCluster().waitForNodeManagersToConnect(5000));
ApplicationId exampleAppId = ApplicationId.fromString(exampleApp.getId());
ApplicationAttemptId applicationAttemptId = client.getYarnClient()
.getApplicationReport(exampleAppId).getCurrentApplicationAttemptId();
Multimap<String, String> containersBeforeFailure = getContainersForAllComp(
client, exampleApp);
LOG.info("Fail the application attempt {}", applicationAttemptId);
client.getYarnClient().failApplicationAttempt(applicationAttemptId);
//wait until attempt 2 is running
GenericTestUtils.waitFor(() -> {
try {
ApplicationReport ar = client.getYarnClient()
.getApplicationReport(exampleAppId);
return ar.getCurrentApplicationAttemptId().getAttemptId() == 2 &&
ar.getYarnApplicationState() == YarnApplicationState.RUNNING;
} catch (YarnException | IOException e) {
throw new RuntimeException("while waiting", e);
}
}, 2000, 200000);
Multimap<String, String> containersAfterFailure = getContainersForAllComp(
client, exampleApp);
Assert.assertEquals("component container affected by restart",
containersBeforeFailure, containersAfterFailure);
LOG.info("Stop/destroy service {}", exampleApp);
client.actionStop(exampleApp.getName(), true);
client.actionDestroy(exampleApp.getName());
}
// Check containers launched are in dependency order
// Get all containers into a list and sort based on container launch time e.g.
// compa-c1, compa-c2, compb-c1, compb-c2;
@ -297,6 +360,24 @@ private void waitForAllCompToBeReady(ServiceClient client,
}, 2000, 200000);
}
/**
* Get all containers of a service.
*/
private Multimap<String, String> getContainersForAllComp(ServiceClient client,
Service example) throws IOException, YarnException {
Multimap<String, String> allContainers = HashMultimap.create();
Service retrievedApp = client.getStatus(example.getName());
retrievedApp.getComponents().forEach(component -> {
if (component.getContainers() != null) {
component.getContainers().forEach(container -> {
allContainers.put(component.getName(), container.getId());
});
}
});
return allContainers;
}
private ServiceClient createClient() throws Exception {
ServiceClient client = new ServiceClient() {
@Override protected Path addJarResource(String appName,

View File

@ -115,7 +115,7 @@ public ContainerTokenIdentifier(ContainerId containerID, String hostName,
this(containerID, 0, hostName, appSubmitter, r, expiryTimeStamp,
masterKeyId, rmIdentifier, priority, creationTime,
logAggregationContext, nodeLabelExpression, containerType,
ExecutionType.GUARANTEED);
ExecutionType.GUARANTEED, -1);
}
public ContainerTokenIdentifier(ContainerId containerID, int containerVersion,
@ -123,6 +123,19 @@ public ContainerTokenIdentifier(ContainerId containerID, int containerVersion,
int masterKeyId, long rmIdentifier, Priority priority, long creationTime,
LogAggregationContext logAggregationContext, String nodeLabelExpression,
ContainerType containerType, ExecutionType executionType) {
this(containerID, containerVersion, hostName, appSubmitter, r,
expiryTimeStamp, masterKeyId, rmIdentifier, priority, creationTime,
logAggregationContext, nodeLabelExpression, containerType,
executionType, -1);
}
public ContainerTokenIdentifier(ContainerId containerID, int containerVersion,
String hostName, String appSubmitter, Resource r, long expiryTimeStamp,
int masterKeyId, long rmIdentifier, Priority priority, long creationTime,
LogAggregationContext logAggregationContext, String nodeLabelExpression,
ContainerType containerType, ExecutionType executionType,
long allocationRequestId) {
ContainerTokenIdentifierProto.Builder builder =
ContainerTokenIdentifierProto.newBuilder();
if (containerID != null) {
@ -152,6 +165,7 @@ public ContainerTokenIdentifier(ContainerId containerID, int containerVersion,
}
builder.setContainerType(convertToProtoFormat(containerType));
builder.setExecutionType(convertToProtoFormat(executionType));
builder.setAllocationRequestId(allocationRequestId);
proto = builder.build();
}
@ -243,6 +257,10 @@ public LogAggregationContext getLogAggregationContext() {
return new LogAggregationContextPBImpl(proto.getLogAggregationContext());
}
public long getAllocationRequestId() {
return proto.getAllocationRequestId();
}
@Override
public void write(DataOutput out) throws IOException {
LOG.debug("Writing ContainerTokenIdentifier to RPC layer: " + this);

View File

@ -53,6 +53,7 @@ message ContainerTokenIdentifierProto {
optional ContainerTypeProto containerType = 12;
optional ExecutionTypeProto executionType = 13 [default = GUARANTEED];
optional int32 version = 14 [default = 0];
optional int64 allocation_request_id = 15 [default = -1];
}
message ClientToAMTokenIdentifierProto {

View File

@ -41,14 +41,14 @@ public static NMContainerStatus newInstance(ContainerId containerId,
long creationTime) {
return newInstance(containerId, version, containerState, allocatedResource,
diagnostics, containerExitStatus, priority, creationTime,
CommonNodeLabelsManager.NO_LABEL, ExecutionType.GUARANTEED);
CommonNodeLabelsManager.NO_LABEL, ExecutionType.GUARANTEED, -1);
}
public static NMContainerStatus newInstance(ContainerId containerId,
int version, ContainerState containerState, Resource allocatedResource,
String diagnostics, int containerExitStatus, Priority priority,
long creationTime, String nodeLabelExpression,
ExecutionType executionType) {
ExecutionType executionType, long allocationRequestId) {
NMContainerStatus status =
Records.newRecord(NMContainerStatus.class);
status.setContainerId(containerId);
@ -61,6 +61,7 @@ public static NMContainerStatus newInstance(ContainerId containerId,
status.setCreationTime(creationTime);
status.setNodeLabelExpression(nodeLabelExpression);
status.setExecutionType(executionType);
status.setAllocationRequestId(allocationRequestId);
return status;
}
@ -130,6 +131,19 @@ public static NMContainerStatus newInstance(ContainerId containerId,
public abstract void setNodeLabelExpression(
String nodeLabelExpression);
/**
* @return the <em>ID</em> corresponding to the original allocation request.
*/
public abstract long getAllocationRequestId();
/**
* Set the <em>ID</em> corresponding to the original allocation request.
*
* @param allocationRequestId the <em>ID</em> corresponding to the original
* allocation request.
*/
public abstract void setAllocationRequestId(long allocationRequestId);
public int getVersion() {
return 0;
}

View File

@ -91,7 +91,8 @@ public String toString() {
.append("Diagnostics: ").append(getDiagnostics()).append(", ")
.append("ExitStatus: ").append(getContainerExitStatus()).append(", ")
.append("NodeLabelExpression: ").append(getNodeLabelExpression())
.append("Priority: ").append(getPriority())
.append("Priority: ").append(getPriority()).append(", ")
.append("AllocationRequestId: ").append(getAllocationRequestId())
.append("]");
return sb.toString();
}
@ -270,6 +271,18 @@ public synchronized void setExecutionType(ExecutionType executionType) {
builder.setExecutionType(convertToProtoFormat(executionType));
}
@Override
public long getAllocationRequestId() {
NMContainerStatusProtoOrBuilder p = viaProto ? proto : builder;
return (p.getAllocationRequestId());
}
@Override
public void setAllocationRequestId(long allocationRequestId) {
maybeInitBuilder();
builder.setAllocationRequestId(allocationRequestId);
}
private void mergeLocalToBuilder() {
if (this.containerId != null
&& !((ContainerIdPBImpl) containerId).getProto().equals(

View File

@ -564,7 +564,7 @@ private Container createContainer(long rmIdentifier, long tokenExpiry,
tokenSecretManager.getCurrentKey().getKeyId(), rmIdentifier,
schedulerKey.getPriority(), currTime,
null, CommonNodeLabelsManager.NO_LABEL, ContainerType.TASK,
ExecutionType.OPPORTUNISTIC);
ExecutionType.OPPORTUNISTIC, schedulerKey.getAllocationRequestId());
byte[] pwd =
tokenSecretManager.createPassword(containerTokenIdentifier);
Token containerToken = newContainerToken(node.getNodeId(), pwd,

View File

@ -176,6 +176,7 @@ message NMContainerStatusProto {
optional string nodeLabelExpression = 8;
optional int32 version = 9;
optional ExecutionTypeProto executionType = 10 [default = GUARANTEED];
optional int64 allocation_request_id = 11 [default = -1];
}
message SCMUploaderNotifyRequestProto {

View File

@ -437,7 +437,7 @@ private void recoverContainer(RecoveredContainerState rcs)
throws IOException {
StartContainerRequest req = rcs.getStartRequest();
ContainerLaunchContext launchContext = req.getContainerLaunchContext();
ContainerTokenIdentifier token = null;
ContainerTokenIdentifier token;
if(rcs.getCapability() != null) {
ContainerTokenIdentifier originalToken =
BuilderUtils.newContainerTokenIdentifier(req.getContainerToken());
@ -449,7 +449,8 @@ private void recoverContainer(RecoveredContainerState rcs)
originalToken.getCreationTime(),
originalToken.getLogAggregationContext(),
originalToken.getNodeLabelExpression(),
originalToken.getContainerType(), originalToken.getExecutionType());
originalToken.getContainerType(), originalToken.getExecutionType(),
originalToken.getAllocationRequestId());
} else {
token = BuilderUtils.newContainerTokenIdentifier(req.getContainerToken());

View File

@ -844,12 +844,14 @@ public ContainerStatus cloneAndGetContainerStatus() {
public NMContainerStatus getNMContainerStatus() {
this.readLock.lock();
try {
return NMContainerStatus.newInstance(this.containerId, this.version,
getCurrentState(), getResource(), diagnostics.toString(), exitCode,
return NMContainerStatus.newInstance(this.containerId,
this.version, getCurrentState(), getResource(),
diagnostics.toString(), exitCode,
containerTokenIdentifier.getPriority(),
containerTokenIdentifier.getCreationTime(),
containerTokenIdentifier.getNodeLabelExpression(),
containerTokenIdentifier.getExecutionType());
containerTokenIdentifier.getExecutionType(),
containerTokenIdentifier.getAllocationRequestId());
} finally {
this.readLock.unlock();
}

View File

@ -571,6 +571,7 @@ private RMContainer recoverAndCreateContainer(NMContainerStatus status,
status.getPriority(), null);
container.setVersion(status.getVersion());
container.setExecutionType(status.getExecutionType());
container.setAllocationRequestId(status.getAllocationRequestId());
ApplicationAttemptId attemptId =
container.getId().getApplicationAttemptId();
RMContainer rmContainer = new RMContainerImpl(container,

View File

@ -655,7 +655,8 @@ private Container updateContainerAndNMToken(RMContainer rmContainer,
container.getNodeId(), getUser(), container.getResource(),
container.getPriority(), rmContainer.getCreationTime(),
this.logAggregationContext, rmContainer.getNodeLabelExpression(),
containerType, container.getExecutionType()));
containerType, container.getExecutionType(),
container.getAllocationRequestId()));
updateNMToken(container);
} catch (IllegalArgumentException e) {
// DNS might be down, skip returning this container.

View File

@ -167,7 +167,7 @@ public void run() {
}
/**
* Helper function for creating ContainerTokens
* Helper function for creating ContainerTokens.
*
* @param containerId Container Id
* @param containerVersion Container Version
@ -183,37 +183,13 @@ public Token createContainerToken(ContainerId containerId,
Resource capability, Priority priority, long createTime) {
return createContainerToken(containerId, containerVersion, nodeId,
appSubmitter, capability, priority, createTime,
null, null, ContainerType.TASK);
null, null, ContainerType.TASK,
ExecutionType.GUARANTEED, -1);
}
/**
* Helper function for creating ContainerTokens.
*
* @param containerId containerId.
* @param containerVersion containerVersion.
* @param nodeId nodeId.
* @param appSubmitter appSubmitter.
* @param capability capability.
* @param priority priority.
* @param createTime createTime.
* @param logAggregationContext logAggregationContext.
* @param nodeLabelExpression nodeLabelExpression.
* @param containerType containerType.
* @return the container-token.
*/
public Token createContainerToken(ContainerId containerId,
int containerVersion, NodeId nodeId, String appSubmitter,
Resource capability, Priority priority, long createTime,
LogAggregationContext logAggregationContext, String nodeLabelExpression,
ContainerType containerType) {
return createContainerToken(containerId, containerVersion, nodeId,
appSubmitter, capability, priority, createTime, null, null,
ContainerType.TASK, ExecutionType.GUARANTEED);
}
/**
* Helper function for creating ContainerTokens
*
* @param containerId Container Id
* @param containerVersion Container version
* @param nodeId Node Id
@ -225,13 +201,15 @@ public Token createContainerToken(ContainerId containerId,
* @param nodeLabelExpression Node Label Expression
* @param containerType Container Type
* @param execType Execution Type
* @param allocationRequestId allocationRequestId
* @return the container-token
*/
public Token createContainerToken(ContainerId containerId,
int containerVersion, NodeId nodeId, String appSubmitter,
Resource capability, Priority priority, long createTime,
LogAggregationContext logAggregationContext, String nodeLabelExpression,
ContainerType containerType, ExecutionType execType) {
ContainerType containerType, ExecutionType execType,
long allocationRequestId) {
byte[] password;
ContainerTokenIdentifier tokenIdentifier;
long expiryTimeStamp =
@ -246,7 +224,7 @@ public Token createContainerToken(ContainerId containerId,
this.currentMasterKey.getMasterKey().getKeyId(),
ResourceManager.getClusterTimeStamp(), priority, createTime,
logAggregationContext, nodeLabelExpression, containerType,
execType);
execType, allocationRequestId);
password = this.createPassword(tokenIdentifier);
} finally {

View File

@ -2113,7 +2113,7 @@ public static NMContainerStatus createNMContainerStatus(
NMContainerStatus.newInstance(containerId, 0, containerState,
Resource.newInstance(1024, 1), "recover container", 0,
Priority.newInstance(0), 0, nodeLabelExpression,
ExecutionType.GUARANTEED);
ExecutionType.GUARANTEED, -1);
return containerReport;
}

View File

@ -2088,17 +2088,17 @@ protected Dispatcher createDispatcher() {
NMContainerStatus.newInstance(c1, 1, ContainerState.RUNNING,
Resource.newInstance(1024, 1), "Dummy Queued OC",
ContainerExitStatus.INVALID, Priority.newInstance(5), 1234, "",
ExecutionType.OPPORTUNISTIC);
ExecutionType.OPPORTUNISTIC, -1);
NMContainerStatus runningOpp =
NMContainerStatus.newInstance(c2, 1, ContainerState.RUNNING,
Resource.newInstance(2048, 1), "Dummy Running OC",
ContainerExitStatus.INVALID, Priority.newInstance(6), 1234, "",
ExecutionType.OPPORTUNISTIC);
ExecutionType.OPPORTUNISTIC, -1);
NMContainerStatus runningGuar =
NMContainerStatus.newInstance(c3, 1, ContainerState.RUNNING,
Resource.newInstance(2048, 1), "Dummy Running GC",
ContainerExitStatus.INVALID, Priority.newInstance(6), 1234, "",
ExecutionType.GUARANTEED);
ExecutionType.GUARANTEED, -1);
req.setContainerStatuses(Arrays.asList(queuedOpp, runningOpp, runningGuar));
// trying to register a invalid node.
RegisterNodeManagerResponse response =

View File

@ -295,12 +295,13 @@ public Token createContainerToken(ContainerId containerId,
int containerVersion, NodeId nodeId, String appSubmitter,
Resource capability, Priority priority, long createTime,
LogAggregationContext logAggregationContext, String nodeLabelExp,
ContainerType containerType, ExecutionType executionType) {
ContainerType containerType, ExecutionType executionType,
long allocationRequestId) {
numRetries++;
return super.createContainerToken(containerId, containerVersion,
nodeId, appSubmitter, capability, priority, createTime,
logAggregationContext, nodeLabelExp, containerType,
executionType);
executionType, allocationRequestId);
}
};
}