YARN-11502. Refactor AMRMProxy#FederationInterceptor#registerApplicationMaster. (#5705)
This commit is contained in:
parent
e6937d7076
commit
9de13f879a
@ -91,6 +91,7 @@
|
|||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
||||||
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
|
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
|
||||||
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||||
import org.apache.hadoop.yarn.server.uam.UnmanagedAMPoolManager;
|
import org.apache.hadoop.yarn.server.uam.UnmanagedAMPoolManager;
|
||||||
import org.apache.hadoop.yarn.util.AsyncCallback;
|
import org.apache.hadoop.yarn.util.AsyncCallback;
|
||||||
import org.apache.hadoop.yarn.util.ConverterUtils;
|
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||||
@ -570,9 +571,12 @@ private Map<String, Token<AMRMTokenIdentifier>> recoverSubClusterAMRMTokenIdenti
|
|||||||
* For the same reason, this method needs to be synchronized.
|
* For the same reason, this method needs to be synchronized.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public synchronized RegisterApplicationMasterResponse
|
public synchronized RegisterApplicationMasterResponse registerApplicationMaster(
|
||||||
registerApplicationMaster(RegisterApplicationMasterRequest request)
|
RegisterApplicationMasterRequest request) throws YarnException, IOException {
|
||||||
throws YarnException, IOException {
|
|
||||||
|
if (request == null) {
|
||||||
|
throw new YarnException("RegisterApplicationMasterRequest can't be null!");
|
||||||
|
}
|
||||||
|
|
||||||
// Reset the heartbeat responseId to zero upon register
|
// Reset the heartbeat responseId to zero upon register
|
||||||
synchronized (this.lastAllocateResponseLock) {
|
synchronized (this.lastAllocateResponseLock) {
|
||||||
@ -590,18 +594,9 @@ private Map<String, Token<AMRMTokenIdentifier>> recoverSubClusterAMRMTokenIdenti
|
|||||||
// Save the registration request. This will be used for registering with
|
// Save the registration request. This will be used for registering with
|
||||||
// secondary sub-clusters using UAMs, as well as re-register later
|
// secondary sub-clusters using UAMs, as well as re-register later
|
||||||
this.amRegistrationRequest = request;
|
this.amRegistrationRequest = request;
|
||||||
if (getNMStateStore() != null) {
|
RegisterApplicationMasterRequestPBImpl requestPB = (RegisterApplicationMasterRequestPBImpl)
|
||||||
try {
|
this.amRegistrationRequest;
|
||||||
RegisterApplicationMasterRequestPBImpl pb =
|
storeAMRMProxyAppContextEntry(NMSS_REG_REQUEST_KEY, requestPB.getProto().toByteArray());
|
||||||
(RegisterApplicationMasterRequestPBImpl)
|
|
||||||
this.amRegistrationRequest;
|
|
||||||
getNMStateStore().storeAMRMProxyAppContextEntry(this.attemptId,
|
|
||||||
NMSS_REG_REQUEST_KEY, pb.getProto().toByteArray());
|
|
||||||
} catch (Exception e) {
|
|
||||||
LOG.error("Error storing AMRMProxy application context entry for "
|
|
||||||
+ this.attemptId, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -625,54 +620,64 @@ private Map<String, Token<AMRMTokenIdentifier>> recoverSubClusterAMRMTokenIdenti
|
|||||||
* is running and will break the elasticity feature. The registration with
|
* is running and will break the elasticity feature. The registration with
|
||||||
* the other sub-cluster RM will be done lazily as needed later.
|
* the other sub-cluster RM will be done lazily as needed later.
|
||||||
*/
|
*/
|
||||||
this.amRegistrationResponse =
|
this.amRegistrationResponse = this.homeRMRelayer.registerApplicationMaster(request);
|
||||||
this.homeRMRelayer.registerApplicationMaster(request);
|
|
||||||
if (this.amRegistrationResponse
|
if (this.amRegistrationResponse == null) {
|
||||||
.getContainersFromPreviousAttempts() != null) {
|
throw new YarnException("RegisterApplicationMasterResponse can't be null!");
|
||||||
cacheAllocatedContainers(
|
}
|
||||||
this.amRegistrationResponse.getContainersFromPreviousAttempts(),
|
|
||||||
this.homeSubClusterId);
|
List<Container> containersFromPreviousAttempts =
|
||||||
|
this.amRegistrationResponse.getContainersFromPreviousAttempts();
|
||||||
|
if (containersFromPreviousAttempts != null) {
|
||||||
|
cacheAllocatedContainers(containersFromPreviousAttempts, this.homeSubClusterId);
|
||||||
}
|
}
|
||||||
|
|
||||||
ApplicationId appId = this.attemptId.getApplicationId();
|
ApplicationId appId = this.attemptId.getApplicationId();
|
||||||
reAttachUAMAndMergeRegisterResponse(this.amRegistrationResponse, appId);
|
reAttachUAMAndMergeRegisterResponse(this.amRegistrationResponse, appId);
|
||||||
|
|
||||||
if (getNMStateStore() != null) {
|
RegisterApplicationMasterResponsePBImpl responsePB = (RegisterApplicationMasterResponsePBImpl)
|
||||||
try {
|
this.amRegistrationResponse;
|
||||||
RegisterApplicationMasterResponsePBImpl pb =
|
storeAMRMProxyAppContextEntry(NMSS_REG_RESPONSE_KEY, responsePB.getProto().toByteArray());
|
||||||
(RegisterApplicationMasterResponsePBImpl)
|
|
||||||
this.amRegistrationResponse;
|
|
||||||
getNMStateStore().storeAMRMProxyAppContextEntry(this.attemptId,
|
|
||||||
NMSS_REG_RESPONSE_KEY, pb.getProto().toByteArray());
|
|
||||||
} catch (Exception e) {
|
|
||||||
LOG.error("Error storing AMRMProxy application context entry for "
|
|
||||||
+ this.attemptId, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// the queue this application belongs will be used for getting
|
// the queue this application belongs will be used for getting
|
||||||
// AMRMProxy policy from state store.
|
// AMRMProxy policy from state store.
|
||||||
String queue = this.amRegistrationResponse.getQueue();
|
String queue = this.amRegistrationResponse.getQueue();
|
||||||
if (queue == null) {
|
if (queue == null) {
|
||||||
LOG.warn("Received null queue for application " + appId
|
LOG.warn("Received null queue for application {} from home subcluster. " +
|
||||||
+ " from home subcluster. Will use default queue name "
|
" Will use default queue name {} for getting AMRMProxyPolicy.", appId,
|
||||||
+ YarnConfiguration.DEFAULT_QUEUE_NAME
|
YarnConfiguration.DEFAULT_QUEUE_NAME);
|
||||||
+ " for getting AMRMProxyPolicy");
|
|
||||||
} else {
|
} else {
|
||||||
LOG.info("Application " + appId + " belongs to queue " + queue);
|
LOG.info("Application {} belongs to queue {}.", appId, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the AMRMProxyPolicy
|
// Initialize the AMRMProxyPolicy
|
||||||
try {
|
try {
|
||||||
this.policyInterpreter =
|
this.policyInterpreter = FederationPolicyUtils.loadAMRMPolicy(queue, this.policyInterpreter,
|
||||||
FederationPolicyUtils.loadAMRMPolicy(queue, this.policyInterpreter,
|
getConf(), this.federationFacade, this.homeSubClusterId);
|
||||||
getConf(), this.federationFacade, this.homeSubClusterId);
|
|
||||||
} catch (FederationPolicyInitializationException e) {
|
} catch (FederationPolicyInitializationException e) {
|
||||||
throw new YarnRuntimeException(e);
|
throw new YarnRuntimeException(e);
|
||||||
}
|
}
|
||||||
return this.amRegistrationResponse;
|
return this.amRegistrationResponse;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store a context entry for this application attempt in the NM state store.
|
||||||
|
* No-op when getNMStateStore() returns null; a failed write is logged, not rethrown.
|
||||||
|
*
|
||||||
|
* @param key key of the context entry, e.g. NMSS_REG_REQUEST_KEY or NMSS_REG_RESPONSE_KEY
|
||||||
|
* @param data bytes to store under the key (callers here pass a serialized protobuf)
|
||||||
|
*/
|
||||||
|
private void storeAMRMProxyAppContextEntry(String key, byte[] data) {
|
||||||
|
NMStateStoreService nmStateStore = getNMStateStore();
|
||||||
|
if (nmStateStore != null) {
|
||||||
|
try {
|
||||||
|
nmStateStore.storeAMRMProxyAppContextEntry(this.attemptId, key, data);
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.error("Error storing AMRMProxy application context entry[{}] for {}.",
|
||||||
|
key, this.attemptId, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sends the heart beats to the home RM and the secondary sub-cluster RMs that
|
* Sends the heart beats to the home RM and the secondary sub-cluster RMs that
|
||||||
* are being used by the application.
|
* are being used by the application.
|
||||||
|
@ -195,10 +195,8 @@ private void deRegisterSubCluster(SubClusterId subClusterId)
|
|||||||
private List<Container> getContainersAndAssert(int numberOfResourceRequests,
|
private List<Container> getContainersAndAssert(int numberOfResourceRequests,
|
||||||
int numberOfAllocationExcepted) throws Exception {
|
int numberOfAllocationExcepted) throws Exception {
|
||||||
AllocateRequest allocateRequest = Records.newRecord(AllocateRequest.class);
|
AllocateRequest allocateRequest = Records.newRecord(AllocateRequest.class);
|
||||||
List<Container> containers =
|
List<Container> containers = new ArrayList<>(numberOfResourceRequests);
|
||||||
new ArrayList<Container>(numberOfResourceRequests);
|
List<ResourceRequest> askList = new ArrayList<>(numberOfResourceRequests);
|
||||||
List<ResourceRequest> askList =
|
|
||||||
new ArrayList<ResourceRequest>(numberOfResourceRequests);
|
|
||||||
for (int id = 0; id < numberOfResourceRequests; id++) {
|
for (int id = 0; id < numberOfResourceRequests; id++) {
|
||||||
askList.add(createResourceRequest("test-node-" + Integer.toString(id),
|
askList.add(createResourceRequest("test-node-" + Integer.toString(id),
|
||||||
6000, 2, id % 5, 1));
|
6000, 2, id % 5, 1));
|
||||||
@ -269,8 +267,8 @@ private void releaseContainersAndAssert(List<Container> containers)
|
|||||||
List<ContainerId> newlyFinished = getCompletedContainerIds(
|
List<ContainerId> newlyFinished = getCompletedContainerIds(
|
||||||
allocateResponse.getCompletedContainersStatuses());
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
containersForReleasedContainerIds.addAll(newlyFinished);
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
LOG.info("Number of containers received in the original request: "
|
LOG.info("Number of containers received in the original request: {}",
|
||||||
+ Integer.toString(newlyFinished.size()));
|
newlyFinished.size());
|
||||||
|
|
||||||
// Send max 10 heart beats to receive all the containers. If not, we will
|
// Send max 10 heart beats to receive all the containers. If not, we will
|
||||||
// fail the test
|
// fail the test
|
||||||
@ -290,10 +288,9 @@ private void releaseContainersAndAssert(List<Container> containers)
|
|||||||
newlyFinished = getCompletedContainerIds(
|
newlyFinished = getCompletedContainerIds(
|
||||||
allocateResponse.getCompletedContainersStatuses());
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
containersForReleasedContainerIds.addAll(newlyFinished);
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
LOG.info("Number of containers received in this request: "
|
LOG.info("Number of containers received in this request: {}.", newlyFinished.size());
|
||||||
+ Integer.toString(newlyFinished.size()));
|
LOG.info("Total number of containers received: {}.",
|
||||||
LOG.info("Total number of containers received: "
|
containersForReleasedContainerIds.size());
|
||||||
+ Integer.toString(containersForReleasedContainerIds.size()));
|
|
||||||
Thread.sleep(10);
|
Thread.sleep(10);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -431,7 +428,7 @@ public Object run() throws Exception {
|
|||||||
FinishApplicationMasterResponse finishResponse =
|
FinishApplicationMasterResponse finishResponse =
|
||||||
interceptor.finishApplicationMaster(finishReq);
|
interceptor.finishApplicationMaster(finishReq);
|
||||||
Assert.assertNotNull(finishResponse);
|
Assert.assertNotNull(finishResponse);
|
||||||
Assert.assertEquals(true, finishResponse.getIsUnregistered());
|
Assert.assertTrue(finishResponse.getIsUnregistered());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -624,7 +621,7 @@ public Object run() throws Exception {
|
|||||||
FinishApplicationMasterResponse finishResponse =
|
FinishApplicationMasterResponse finishResponse =
|
||||||
interceptor.finishApplicationMaster(finishReq);
|
interceptor.finishApplicationMaster(finishReq);
|
||||||
Assert.assertNotNull(finishResponse);
|
Assert.assertNotNull(finishResponse);
|
||||||
Assert.assertEquals(true, finishResponse.getIsUnregistered());
|
Assert.assertTrue(finishResponse.getIsUnregistered());
|
||||||
|
|
||||||
// After the application succeeds, the registry/NMSS entry should be
|
// After the application succeeds, the registry/NMSS entry should be
|
||||||
// cleaned up
|
// cleaned up
|
||||||
@ -805,7 +802,7 @@ public Object run() throws Exception {
|
|||||||
Assert.assertEquals(0, interceptor.getTimedOutSCs(true).size());
|
Assert.assertEquals(0, interceptor.getTimedOutSCs(true).size());
|
||||||
|
|
||||||
// Generate a duplicate heartbeat from AM, so that it won't really
|
// Generate a duplicate heartbeat from AM, so that it won't really
|
||||||
// trigger an heartbeat to all SC
|
// trigger a heartbeat to all SC
|
||||||
AllocateRequest allocateRequest =
|
AllocateRequest allocateRequest =
|
||||||
Records.newRecord(AllocateRequest.class);
|
Records.newRecord(AllocateRequest.class);
|
||||||
// Set to lastResponseId - 1 so that it will be considered a duplicate
|
// Set to lastResponseId - 1 so that it will be considered a duplicate
|
||||||
@ -904,7 +901,7 @@ public Object run() throws Exception {
|
|||||||
FinishApplicationMasterResponse finishResponse =
|
FinishApplicationMasterResponse finishResponse =
|
||||||
interceptor.finishApplicationMaster(finishReq);
|
interceptor.finishApplicationMaster(finishReq);
|
||||||
Assert.assertNotNull(finishResponse);
|
Assert.assertNotNull(finishResponse);
|
||||||
Assert.assertEquals(true, finishResponse.getIsUnregistered());
|
Assert.assertTrue(finishResponse.getIsUnregistered());
|
||||||
|
|
||||||
// After the application succeeds, the registry entry should be deleted
|
// After the application succeeds, the registry entry should be deleted
|
||||||
if (interceptor.getRegistryClient() != null) {
|
if (interceptor.getRegistryClient() != null) {
|
||||||
|
Loading…
Reference in New Issue
Block a user