YARN-11509. Add retry logic to FederationInterceptor#launchUAM. (#5727)
This commit is contained in:
parent
33b1677e9e
commit
8b88e9f8f4
@ -4058,6 +4058,20 @@ public static boolean isAclEnabled(Configuration conf) {
|
|||||||
public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
|
public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
|
||||||
60000; // one minute
|
60000; // one minute
|
||||||
|
|
||||||
|
// AMRMProxy Register UAM Retry-Num
|
||||||
|
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
|
||||||
|
FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
|
||||||
|
// When registering a UAM, retry a maximum of 3 times by default.
|
||||||
|
public static final int DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
|
||||||
|
3;
|
||||||
|
|
||||||
|
// AMRMProxy Register UAM Retry Interval
|
||||||
|
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
|
||||||
|
FEDERATION_PREFIX + "amrmproxy.register.uam.interval";
|
||||||
|
// Retry Interval, default 100 ms
|
||||||
|
public static final long DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
|
||||||
|
TimeUnit.MILLISECONDS.toMillis(100);
|
||||||
|
|
||||||
public static final String DEFAULT_FEDERATION_POLICY_KEY = "*";
|
public static final String DEFAULT_FEDERATION_POLICY_KEY = "*";
|
||||||
public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX
|
public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX
|
||||||
+ "policy-manager";
|
+ "policy-manager";
|
||||||
|
@ -5408,4 +5408,22 @@
|
|||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>
|
||||||
|
The number of retries when registering a UAM.
|
||||||
|
The default value is 3.
|
||||||
|
</description>
|
||||||
|
<name>yarn.federation.amrmproxy.register.uam.retry-count</name>
|
||||||
|
<value>3</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>
|
||||||
|
The interval between retries when registering a UAM.
|
||||||
|
The default value is 100ms.
|
||||||
|
</description>
|
||||||
|
<name>yarn.federation.amrmproxy.register.uam.interval</name>
|
||||||
|
<value>100ms</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
</configuration>
|
</configuration>
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
@ -87,6 +88,7 @@
|
|||||||
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
|
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
|
||||||
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
|
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
|
||||||
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
|
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
|
||||||
|
import org.apache.hadoop.yarn.server.federation.retry.FederationActionRetry;
|
||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
|
||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
||||||
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
|
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
|
||||||
@ -251,6 +253,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
|||||||
// the maximum wait time for the first async heart beat response
|
// the maximum wait time for the first async heart beat response
|
||||||
private long heartbeatMaxWaitTimeMs;
|
private long heartbeatMaxWaitTimeMs;
|
||||||
|
|
||||||
|
private int registerUamRetryNum;
|
||||||
|
|
||||||
|
private long registerUamRetryInterval;
|
||||||
|
|
||||||
private boolean waitUamRegisterDone;
|
private boolean waitUamRegisterDone;
|
||||||
|
|
||||||
private MonotonicClock clock = new MonotonicClock();
|
private MonotonicClock clock = new MonotonicClock();
|
||||||
@ -355,6 +361,24 @@ public void init(AMRMProxyApplicationContext appContext) {
|
|||||||
this.subClusterTimeOut =
|
this.subClusterTimeOut =
|
||||||
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT;
|
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.registerUamRetryNum = conf.getInt(
|
||||||
|
YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
|
||||||
|
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
|
||||||
|
// A non-positive retry count is rejected; fall back to the default.
// The message must report the retry count that was actually configured,
// not the sub-cluster timeout read earlier in this method.
if (this.registerUamRetryNum <= 0) {
  LOG.info("{} configured to be {}, should be positive. Using default of {}.",
      YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
      this.registerUamRetryNum,
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
  this.registerUamRetryNum =
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT;
}
|
||||||
|
|
||||||
|
this.registerUamRetryInterval = conf.getTimeDuration(
|
||||||
|
YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
|
||||||
|
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
|
||||||
|
TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE,
|
this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE,
|
||||||
YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE);
|
YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE);
|
||||||
}
|
}
|
||||||
@ -701,7 +725,7 @@ public AllocateResponse allocate(AllocateRequest request)
|
|||||||
|
|
||||||
if (this.finishAMCalled) {
|
if (this.finishAMCalled) {
|
||||||
LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat "
|
LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat "
|
||||||
+ "processing and return dummy response" + this.attemptId);
|
+ "processing and return dummy response.", this.attemptId);
|
||||||
return RECORD_FACTORY.newRecordInstance(AllocateResponse.class);
|
return RECORD_FACTORY.newRecordInstance(AllocateResponse.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1255,85 +1279,77 @@ private List<SubClusterId> registerAndAllocateWithNewSubClusters(
|
|||||||
// Check to see if there are any new sub-clusters in this request
|
// Check to see if there are any new sub-clusters in this request
|
||||||
// list and create and register Unmanaged AM instance for the new ones
|
// list and create and register Unmanaged AM instance for the new ones
|
||||||
List<SubClusterId> newSubClusters = new ArrayList<>();
|
List<SubClusterId> newSubClusters = new ArrayList<>();
|
||||||
for (SubClusterId subClusterId : requests.keySet()) {
|
|
||||||
if (!subClusterId.equals(this.homeSubClusterId)
|
|
||||||
&& !this.uamPool.hasUAMId(subClusterId.getId())) {
|
|
||||||
newSubClusters.add(subClusterId);
|
|
||||||
|
|
||||||
|
requests.keySet().stream().forEach(subClusterId -> {
|
||||||
|
String id = subClusterId.getId();
|
||||||
|
if (!subClusterId.equals(this.homeSubClusterId) && !this.uamPool.hasUAMId(id)) {
|
||||||
|
newSubClusters.add(subClusterId);
|
||||||
// Set sub-cluster to be timed out initially
|
// Set sub-cluster to be timed out initially
|
||||||
lastSCResponseTime.put(subClusterId,
|
lastSCResponseTime.put(subClusterId, clock.getTime() - subClusterTimeOut);
|
||||||
clock.getTime() - subClusterTimeOut);
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
this.uamRegisterFutures.clear();
|
this.uamRegisterFutures.clear();
|
||||||
|
|
||||||
for (final SubClusterId scId : newSubClusters) {
|
for (final SubClusterId scId : newSubClusters) {
|
||||||
Future<?> future = this.threadpool.submit(new Runnable() {
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
String subClusterId = scId.getId();
|
|
||||||
|
|
||||||
// Create a config loaded with federation on and subclusterId
|
Future<?> future = this.threadpool.submit(() -> {
|
||||||
// for each UAM
|
|
||||||
YarnConfiguration config = new YarnConfiguration(getConf());
|
|
||||||
FederationProxyProviderUtil.updateConfForFederation(config,
|
|
||||||
subClusterId);
|
|
||||||
|
|
||||||
RegisterApplicationMasterResponse uamResponse = null;
|
String subClusterId = scId.getId();
|
||||||
Token<AMRMTokenIdentifier> token = null;
|
|
||||||
try {
|
|
||||||
ApplicationId applicationId = attemptId.getApplicationId();
|
|
||||||
ApplicationSubmissionContext originalSubmissionContext =
|
|
||||||
federationFacade.getApplicationSubmissionContext(applicationId);
|
|
||||||
|
|
||||||
// For appNameSuffix, use subClusterId of the home sub-cluster
|
// Create a config loaded with federation on and subclusterId
|
||||||
token = uamPool.launchUAM(subClusterId, config,
|
// for each UAM
|
||||||
applicationId, amRegistrationResponse.getQueue(),
|
YarnConfiguration config = new YarnConfiguration(getConf());
|
||||||
getApplicationContext().getUser(), homeSubClusterId.toString(),
|
FederationProxyProviderUtil.updateConfForFederation(config, subClusterId);
|
||||||
true, subClusterId, originalSubmissionContext);
|
ApplicationId applicationId = attemptId.getApplicationId();
|
||||||
|
|
||||||
secondaryRelayers.put(subClusterId,
|
RegisterApplicationMasterResponse uamResponse;
|
||||||
uamPool.getAMRMClientRelayer(subClusterId));
|
Token<AMRMTokenIdentifier> token;
|
||||||
|
|
||||||
uamResponse = uamPool.registerApplicationMaster(subClusterId,
|
// LaunchUAM And RegisterApplicationMaster
|
||||||
amRegistrationRequest);
|
try {
|
||||||
} catch (Throwable e) {
|
TokenAndRegisterResponse result =
|
||||||
LOG.error("Failed to register application master: " + subClusterId
|
((FederationActionRetry<TokenAndRegisterResponse>) (retryCount) ->
|
||||||
+ " Application: " + attemptId, e);
|
launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId)).
|
||||||
// TODO: UAM registration for this sub-cluster RM
|
runWithRetries(registerUamRetryNum, registerUamRetryInterval);
|
||||||
// failed. For now, we ignore the resource requests and continue
|
|
||||||
// but we need to fix this and handle this situation. One way would
|
token = result.getToken();
|
||||||
// be to send the request to another RM by consulting the policy.
|
uamResponse = result.getResponse();
|
||||||
return;
|
} catch (Throwable e) {
|
||||||
}
|
LOG.error("Failed to register application master: {} Application: {}.",
|
||||||
uamRegistrations.put(scId, uamResponse);
|
subClusterId, attemptId, e);
|
||||||
LOG.info("Successfully registered unmanaged application master: "
|
return;
|
||||||
+ subClusterId + " ApplicationId: " + attemptId);
|
}
|
||||||
|
|
||||||
try {
|
uamRegistrations.put(scId, uamResponse);
|
||||||
uamPool.allocateAsync(subClusterId, requests.get(scId),
|
|
||||||
new HeartbeatCallBack(scId, true));
|
LOG.info("Successfully registered unmanaged application master: {} " +
|
||||||
} catch (Throwable e) {
|
"ApplicationId: {}.", subClusterId, attemptId);
|
||||||
LOG.error("Failed to allocate async to " + subClusterId
|
|
||||||
+ " Application: " + attemptId, e);
|
// Allocate Request
|
||||||
}
|
try {
|
||||||
|
uamPool.allocateAsync(subClusterId, requests.get(scId),
|
||||||
// Save the UAM token in registry or NMSS
|
new HeartbeatCallBack(scId, true));
|
||||||
try {
|
} catch (Throwable e) {
|
||||||
if (registryClient != null) {
|
LOG.error("Failed to allocate async to {} Application: {}.",
|
||||||
registryClient.writeAMRMTokenForUAM(attemptId.getApplicationId(),
|
subClusterId, attemptId, e);
|
||||||
subClusterId, token);
|
}
|
||||||
} else if (getNMStateStore() != null) {
|
|
||||||
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
|
// Save the UAM token in registry or NMSS
|
||||||
NMSS_SECONDARY_SC_PREFIX + subClusterId,
|
try {
|
||||||
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
|
if (registryClient != null) {
|
||||||
}
|
registryClient.writeAMRMTokenForUAM(applicationId, subClusterId, token);
|
||||||
} catch (Throwable e) {
|
} else if (getNMStateStore() != null) {
|
||||||
LOG.error("Failed to persist UAM token from " + subClusterId
|
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
|
||||||
+ " Application: " + attemptId, e);
|
NMSS_SECONDARY_SC_PREFIX + subClusterId,
|
||||||
|
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
|
||||||
}
|
}
|
||||||
|
} catch (Throwable e) {
|
||||||
|
LOG.error("Failed to persist UAM token from {} Application {}",
|
||||||
|
subClusterId, attemptId, e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
this.uamRegisterFutures.put(scId, future);
|
this.uamRegisterFutures.put(scId, future);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1347,10 +1363,34 @@ public void run() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return newSubClusters;
|
return newSubClusters;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(
|
||||||
|
YarnConfiguration config, String subClusterId, ApplicationId applicationId)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
|
||||||
|
// Prepare parameter information
|
||||||
|
ApplicationSubmissionContext originalSubmissionContext =
|
||||||
|
federationFacade.getApplicationSubmissionContext(applicationId);
|
||||||
|
String submitter = getApplicationContext().getUser();
|
||||||
|
String homeRM = homeSubClusterId.toString();
|
||||||
|
String queue = amRegistrationResponse.getQueue();
|
||||||
|
|
||||||
|
// For appNameSuffix, use subClusterId of the home sub-cluster
|
||||||
|
Token<AMRMTokenIdentifier> token = uamPool.launchUAM(subClusterId, config, applicationId,
|
||||||
|
queue, submitter, homeRM, true, subClusterId, originalSubmissionContext);
|
||||||
|
|
||||||
|
// Set the relationship between SubCluster and AMRMClientRelayer.
|
||||||
|
secondaryRelayers.put(subClusterId, uamPool.getAMRMClientRelayer(subClusterId));
|
||||||
|
|
||||||
|
// RegisterApplicationMaster
|
||||||
|
RegisterApplicationMasterResponse uamResponse =
|
||||||
|
uamPool.registerApplicationMaster(subClusterId, amRegistrationRequest);
|
||||||
|
|
||||||
|
return new TokenAndRegisterResponse(token, uamResponse);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepare the base allocation response. Use lastSCResponse and
|
* Prepare the base allocation response. Use lastSCResponse and
|
||||||
* lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g.
|
* lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g.
|
||||||
|
@ -0,0 +1,45 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
||||||
|
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
||||||
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class contains information about the AMRM token and the RegisterApplicationMasterResponse.
|
||||||
|
*/
|
||||||
|
public class TokenAndRegisterResponse {
|
||||||
|
private Token<AMRMTokenIdentifier> token;
|
||||||
|
private RegisterApplicationMasterResponse response;
|
||||||
|
|
||||||
|
public TokenAndRegisterResponse(Token<AMRMTokenIdentifier> pToken,
|
||||||
|
RegisterApplicationMasterResponse pResponse) {
|
||||||
|
this.token = pToken;
|
||||||
|
this.response = pResponse;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token<AMRMTokenIdentifier> getToken() {
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RegisterApplicationMasterResponse getResponse() {
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
}
|
@ -38,7 +38,6 @@
|
|||||||
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
|
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
|
||||||
import org.apache.hadoop.test.LambdaTestUtils;
|
import org.apache.hadoop.test.LambdaTestUtils;
|
||||||
import org.apache.hadoop.util.Time;
|
import org.apache.hadoop.util.Time;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||||
@ -179,9 +178,8 @@ protected YarnConfiguration createConfiguration() {
|
|||||||
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT,
|
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT,
|
||||||
500);
|
500);
|
||||||
|
|
||||||
// Wait UAM Register Down
|
// Register UAM Retry Interval 1ms
|
||||||
conf.setBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, true);
|
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL, 1);
|
||||||
|
|
||||||
return conf;
|
return conf;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -597,10 +595,6 @@ public Object run() throws Exception {
|
|||||||
interceptor.recover(recoveredDataMap);
|
interceptor.recover(recoveredDataMap);
|
||||||
|
|
||||||
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
// Waiting for SC-1 to time out.
|
|
||||||
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
|
|
||||||
|
|
||||||
// SC1 should be initialized to be timed out
|
// SC1 should be initialized to be timed out
|
||||||
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
|
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
|
||||||
|
|
||||||
@ -859,7 +853,7 @@ public Object run() throws Exception {
|
|||||||
List<Container> containers =
|
List<Container> containers =
|
||||||
getContainersAndAssert(numberOfContainers, numberOfContainers * 2);
|
getContainersAndAssert(numberOfContainers, numberOfContainers * 2);
|
||||||
for (Container c : containers) {
|
for (Container c : containers) {
|
||||||
LOG.info("Allocated container {}", c.getId());
|
LOG.info("Allocated container " + c.getId());
|
||||||
}
|
}
|
||||||
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
@ -893,10 +887,6 @@ public Object run() throws Exception {
|
|||||||
int numberOfContainers = 3;
|
int numberOfContainers = 3;
|
||||||
// Should re-attach secondaries and get the three running containers
|
// Should re-attach secondaries and get the three running containers
|
||||||
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
// Waiting for SC-1 to time out.
|
|
||||||
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
|
|
||||||
|
|
||||||
// SC1 should be initialized to be timed out
|
// SC1 should be initialized to be timed out
|
||||||
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
|
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
|
||||||
Assert.assertEquals(numberOfContainers,
|
Assert.assertEquals(numberOfContainers,
|
||||||
@ -1444,4 +1434,53 @@ private void finishApplication() throws IOException, YarnException {
|
|||||||
Assert.assertNotNull(finishResponse);
|
Assert.assertNotNull(finishResponse);
|
||||||
Assert.assertTrue(finishResponse.getIsUnregistered());
|
Assert.assertTrue(finishResponse.getIsUnregistered());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLaunchUAMAndRegisterApplicationMasterRetry() throws Exception {
|
||||||
|
|
||||||
|
UserGroupInformation ugi = interceptor.getUGIWithToken(interceptor.getAttemptId());
|
||||||
|
interceptor.setRetryCount(2);
|
||||||
|
|
||||||
|
ugi.doAs((PrivilegedExceptionAction<Object>) () -> {
|
||||||
|
// Register the application
|
||||||
|
RegisterApplicationMasterRequest registerReq =
|
||||||
|
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||||
|
registerReq.setHost(Integer.toString(testAppId));
|
||||||
|
registerReq.setRpcPort(0);
|
||||||
|
registerReq.setTrackingUrl("");
|
||||||
|
|
||||||
|
RegisterApplicationMasterResponse registerResponse =
|
||||||
|
interceptor.registerApplicationMaster(registerReq);
|
||||||
|
Assert.assertNotNull(registerResponse);
|
||||||
|
lastResponseId = 0;
|
||||||
|
|
||||||
|
Assert.assertEquals(0, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
|
// Allocate the first batch of containers, with sc1 active
|
||||||
|
registerSubCluster(SubClusterId.newInstance("SC-1"));
|
||||||
|
|
||||||
|
int numberOfContainers = 3;
|
||||||
|
List<Container> containers = getContainersAndAssert(numberOfContainers, numberOfContainers);
|
||||||
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
|
// Release all containers
|
||||||
|
releaseContainersAndAssert(containers);
|
||||||
|
|
||||||
|
// Finish the application
|
||||||
|
FinishApplicationMasterRequest finishReq =
|
||||||
|
Records.newRecord(FinishApplicationMasterRequest.class);
|
||||||
|
finishReq.setDiagnostics("");
|
||||||
|
finishReq.setTrackingUrl("");
|
||||||
|
finishReq.setFinalApplicationStatus(FinalApplicationStatus.SUCCEEDED);
|
||||||
|
|
||||||
|
FinishApplicationMasterResponse finishResponse =
|
||||||
|
interceptor.finishApplicationMaster(finishReq);
|
||||||
|
Assert.assertNotNull(finishResponse);
|
||||||
|
Assert.assertTrue(finishResponse.getIsUnregistered());
|
||||||
|
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
Assert.assertEquals(0, interceptor.getRetryCount());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -55,6 +55,7 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
|||||||
private MockResourceManagerFacade mockRm;
|
private MockResourceManagerFacade mockRm;
|
||||||
|
|
||||||
private boolean isClientRPC = false;
|
private boolean isClientRPC = false;
|
||||||
|
private int retryCount = 0;
|
||||||
|
|
||||||
public TestableFederationInterceptor() {
|
public TestableFederationInterceptor() {
|
||||||
}
|
}
|
||||||
@ -258,6 +259,24 @@ protected <T> T createRMProxy(Class<T> protocol, Configuration config,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(YarnConfiguration config,
|
||||||
|
String subClusterId, ApplicationId applicationId) throws IOException, YarnException {
|
||||||
|
if (retryCount > 0) {
|
||||||
|
retryCount--;
|
||||||
|
throw new YarnException("launchUAMAndRegisterApplicationMaster will retry");
|
||||||
|
}
|
||||||
|
return super.launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRetryCount(int retryCount) {
|
||||||
|
this.retryCount = retryCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRetryCount() {
|
||||||
|
return retryCount;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrap the handler thread, so it calls from the same user.
|
* Wrap the handler thread, so it calls from the same user.
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user