YARN-6955. Handle concurrent register AM requests in FederationInterceptor. (Botong Huang via Subru).
This commit is contained in:
parent
bc206806da
commit
c61f2c4198
@ -594,11 +594,9 @@
|
||||
<Bug pattern="UL_UNRELEASED_LOCK_EXCEPTION_PATH" />
|
||||
</Match>
|
||||
|
||||
<!-- Ignore false alert for RCN_REDUNDANT_NULLCHECK_OF_NULL_VALUE -->
|
||||
<Match>
|
||||
<Class name="org.apache.hadoop.yarn.server.nodemanager.amrmproxy.FederationInterceptor" />
|
||||
<Method name="registerApplicationMaster" />
|
||||
<Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NULL_VALUE" />
|
||||
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||
</Match>
|
||||
|
||||
</FindBugsFilter>
|
||||
|
@ -246,6 +246,16 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||
|
||||
shouldReRegisterNext = false;
|
||||
|
||||
synchronized (applicationContainerIdMap) {
|
||||
if (applicationContainerIdMap.containsKey(amrmToken)) {
|
||||
throw new InvalidApplicationMasterRequestException(
|
||||
AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE);
|
||||
}
|
||||
// Keep track of the containers that are returned to this application
|
||||
applicationContainerIdMap.put(amrmToken, new ArrayList<ContainerId>());
|
||||
}
|
||||
|
||||
// Make sure we wait for certain test cases last in the method
|
||||
synchronized (syncObj) {
|
||||
syncObj.notifyAll();
|
||||
// We reuse the port number to indicate whether the unit test want us to
|
||||
@ -261,14 +271,6 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||
}
|
||||
}
|
||||
|
||||
synchronized (applicationContainerIdMap) {
|
||||
if (applicationContainerIdMap.containsKey(amrmToken)) {
|
||||
throw new InvalidApplicationMasterRequestException(
|
||||
AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE);
|
||||
}
|
||||
// Keep track of the containers that are returned to this application
|
||||
applicationContainerIdMap.put(amrmToken, new ArrayList<ContainerId>());
|
||||
}
|
||||
return RegisterApplicationMasterResponse.newInstance(null, null, null, null,
|
||||
null, request.getHost(), null);
|
||||
}
|
||||
|
@ -208,22 +208,25 @@ public void init(AMRMProxyApplicationContext appContext) {
|
||||
* requests from AM because of timeout between AM and AMRMProxy, which is
|
||||
* shorter than the timeout + failOver between FederationInterceptor
|
||||
* (AMRMProxy) and RM.
|
||||
*
|
||||
* For the same reason, this method needs to be synchronized.
|
||||
*/
|
||||
@Override
|
||||
public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||
RegisterApplicationMasterRequest request)
|
||||
throws YarnException, IOException {
|
||||
public synchronized RegisterApplicationMasterResponse
|
||||
registerApplicationMaster(RegisterApplicationMasterRequest request)
|
||||
throws YarnException, IOException {
|
||||
// If AM is calling with a different request, complain
|
||||
if (this.amRegistrationRequest != null
|
||||
&& !this.amRegistrationRequest.equals(request)) {
|
||||
throw new YarnException("A different request body recieved. AM should"
|
||||
+ " not call registerApplicationMaster with different request body");
|
||||
if (this.amRegistrationRequest != null) {
|
||||
if (!this.amRegistrationRequest.equals(request)) {
|
||||
throw new YarnException("AM should not call "
|
||||
+ "registerApplicationMaster with a different request body");
|
||||
}
|
||||
} else {
|
||||
// Save the registration request. This will be used for registering with
|
||||
// secondary sub-clusters using UAMs, as well as re-register later
|
||||
this.amRegistrationRequest = request;
|
||||
}
|
||||
|
||||
// Save the registration request. This will be used for registering with
|
||||
// secondary sub-clusters using UAMs, as well as re-register later
|
||||
this.amRegistrationRequest = request;
|
||||
|
||||
/*
|
||||
* Present to AM as if we are the RM that never fails over. When actual RM
|
||||
* fails over, we always re-register automatically.
|
||||
@ -245,22 +248,8 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||
* is running and will breaks the elasticity feature. The registration with
|
||||
* the other sub-cluster RM will be done lazily as needed later.
|
||||
*/
|
||||
try {
|
||||
this.amRegistrationResponse =
|
||||
this.homeRM.registerApplicationMaster(request);
|
||||
} catch (InvalidApplicationMasterRequestException e) {
|
||||
if (e.getMessage()
|
||||
.contains(AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE)) {
|
||||
// Some other register thread might have succeeded in the meantime
|
||||
if (this.amRegistrationResponse != null) {
|
||||
LOG.info("Other concurrent thread registered successfully, "
|
||||
+ "simply return the same success register response");
|
||||
return this.amRegistrationResponse;
|
||||
}
|
||||
}
|
||||
// This is a real issue, throw back to AM
|
||||
throw e;
|
||||
}
|
||||
this.amRegistrationResponse =
|
||||
this.homeRM.registerApplicationMaster(request);
|
||||
|
||||
// the queue this application belongs will be used for getting
|
||||
// AMRMProxy policy from state store.
|
||||
|
@ -21,6 +21,11 @@
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorCompletionService;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
|
||||
@ -36,6 +41,7 @@
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.MockResourceManagerFacade;
|
||||
import org.apache.hadoop.yarn.server.federation.policies.manager.UniformBroadcastPolicyManager;
|
||||
import org.apache.hadoop.yarn.server.federation.store.impl.MemoryFederationStateStore;
|
||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterDeregisterRequest;
|
||||
@ -234,7 +240,7 @@ public void testMultipleSubClusters() throws Exception {
|
||||
RegisterApplicationMasterRequest registerReq =
|
||||
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||
registerReq.setHost(Integer.toString(testAppId));
|
||||
registerReq.setRpcPort(testAppId);
|
||||
registerReq.setRpcPort(0);
|
||||
registerReq.setTrackingUrl("");
|
||||
|
||||
RegisterApplicationMasterResponse registerResponse =
|
||||
@ -298,7 +304,7 @@ public void testReregister() throws Exception {
|
||||
RegisterApplicationMasterRequest registerReq =
|
||||
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||
registerReq.setHost(Integer.toString(testAppId));
|
||||
registerReq.setRpcPort(testAppId);
|
||||
registerReq.setRpcPort(0);
|
||||
registerReq.setTrackingUrl("");
|
||||
|
||||
RegisterApplicationMasterResponse registerResponse =
|
||||
@ -338,6 +344,78 @@ public void testReregister() throws Exception {
|
||||
Assert.assertEquals(true, finshResponse.getIsUnregistered());
|
||||
}
|
||||
|
||||
/*
|
||||
* Test concurrent register threads. This is possible because the timeout
|
||||
* between AM and AMRMProxy is shorter than the timeout + failOver between
|
||||
* FederationInterceptor (AMRMProxy) and RM. When first call is blocked due to
|
||||
* RM failover and AM timeout, it will call us resulting in a second register
|
||||
* thread.
|
||||
*/
|
||||
@Test(timeout = 5000)
|
||||
public void testConcurrentRegister()
|
||||
throws InterruptedException, ExecutionException {
|
||||
ExecutorService threadpool = Executors.newCachedThreadPool();
|
||||
ExecutorCompletionService<RegisterApplicationMasterResponse> compSvc =
|
||||
new ExecutorCompletionService<>(threadpool);
|
||||
|
||||
Object syncObj = MockResourceManagerFacade.getSyncObj();
|
||||
|
||||
// Two register threads
|
||||
synchronized (syncObj) {
|
||||
// Make sure first thread will block within RM, before the second thread
|
||||
// starts
|
||||
LOG.info("Starting first register thread");
|
||||
compSvc.submit(new ConcurrentRegisterAMCallable());
|
||||
|
||||
try {
|
||||
LOG.info("Test main starts waiting for the first thread to block");
|
||||
syncObj.wait();
|
||||
LOG.info("Test main wait finished");
|
||||
} catch (Exception e) {
|
||||
LOG.info("Test main wait interrupted", e);
|
||||
}
|
||||
}
|
||||
|
||||
// The second thread will get already registered exception from RM.
|
||||
LOG.info("Starting second register thread");
|
||||
compSvc.submit(new ConcurrentRegisterAMCallable());
|
||||
|
||||
// Notify the first register thread to return
|
||||
LOG.info("Let first blocked register thread move on");
|
||||
synchronized (syncObj) {
|
||||
syncObj.notifyAll();
|
||||
}
|
||||
|
||||
// Both thread should return without exception
|
||||
RegisterApplicationMasterResponse response = compSvc.take().get();
|
||||
Assert.assertNotNull(response);
|
||||
|
||||
response = compSvc.take().get();
|
||||
Assert.assertNotNull(response);
|
||||
|
||||
threadpool.shutdown();
|
||||
}
|
||||
|
||||
/**
|
||||
* A callable that calls registerAM to RM with blocking.
|
||||
*/
|
||||
public class ConcurrentRegisterAMCallable
|
||||
implements Callable<RegisterApplicationMasterResponse> {
|
||||
@Override
|
||||
public RegisterApplicationMasterResponse call() throws Exception {
|
||||
RegisterApplicationMasterResponse response = null;
|
||||
try {
|
||||
// Use port number 1001 to let mock RM block in the register call
|
||||
response = interceptor.registerApplicationMaster(
|
||||
RegisterApplicationMasterRequest.newInstance(null, 1001, null));
|
||||
} catch (Exception e) {
|
||||
LOG.info("Register thread exception", e);
|
||||
response = null;
|
||||
}
|
||||
return response;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRequestInterceptorChainCreation() throws Exception {
|
||||
RequestInterceptor root =
|
||||
@ -381,7 +459,7 @@ public void testTwoIdenticalRegisterRequest() throws Exception {
|
||||
RegisterApplicationMasterRequest registerReq =
|
||||
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||
registerReq.setHost(Integer.toString(testAppId));
|
||||
registerReq.setRpcPort(testAppId);
|
||||
registerReq.setRpcPort(0);
|
||||
registerReq.setTrackingUrl("");
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
@ -397,7 +475,7 @@ public void testTwoDifferentRegisterRequest() throws Exception {
|
||||
RegisterApplicationMasterRequest registerReq =
|
||||
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||
registerReq.setHost(Integer.toString(testAppId));
|
||||
registerReq.setRpcPort(testAppId);
|
||||
registerReq.setRpcPort(0);
|
||||
registerReq.setTrackingUrl("");
|
||||
|
||||
RegisterApplicationMasterResponse registerResponse =
|
||||
@ -407,7 +485,7 @@ public void testTwoDifferentRegisterRequest() throws Exception {
|
||||
// Register the application second time with a different request obj
|
||||
registerReq = Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||
registerReq.setHost(Integer.toString(testAppId));
|
||||
registerReq.setRpcPort(testAppId);
|
||||
registerReq.setRpcPort(0);
|
||||
registerReq.setTrackingUrl("different");
|
||||
try {
|
||||
registerResponse = interceptor.registerApplicationMaster(registerReq);
|
||||
|
Loading…
Reference in New Issue
Block a user