YARN-11509. Add retry logic to FederationInterceptor#launchUAM. (#5727)

This commit is contained in:
slfan1989 2023-07-12 09:47:07 +08:00 committed by GitHub
parent 33b1677e9e
commit 8b88e9f8f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 255 additions and 80 deletions

View File

@ -4058,6 +4058,20 @@ public static boolean isAclEnabled(Configuration conf) {
public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
60000; // one minute
// Configuration key: number of retries when the AMRMProxy registers a UAM.
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
// Default retry count: registering a UAM is retried a maximum of 3 times.
public static final int DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
3;
// Configuration key: interval between retries when the AMRMProxy registers a UAM.
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
FEDERATION_PREFIX + "amrmproxy.register.uam.interval";
// Default retry interval: 100 milliseconds.
public static final long DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
TimeUnit.MILLISECONDS.toMillis(100);
public static final String DEFAULT_FEDERATION_POLICY_KEY = "*";
public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX
+ "policy-manager";

View File

@ -5408,4 +5408,22 @@
</description>
</property>
<property>
<description>
The maximum number of retries when registering a UAM.
The default value is 3.
</description>
<name>yarn.federation.amrmproxy.register.uam.retry-count</name>
<value>3</value>
</property>
<property>
<description>
The interval between retries when registering a UAM.
The default value is 100ms.
</description>
<name>yarn.federation.amrmproxy.register.uam.interval</name>
<value>100ms</value>
</property>
</configuration>

View File

@ -36,6 +36,7 @@
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@ -87,6 +88,7 @@
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
import org.apache.hadoop.yarn.server.federation.retry.FederationActionRetry;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
@ -251,6 +253,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
// the maximum wait time for the first async heart beat response
private long heartbeatMaxWaitTimeMs;
private int registerUamRetryNum;
private long registerUamRetryInterval;
private boolean waitUamRegisterDone;
private MonotonicClock clock = new MonotonicClock();
@ -355,6 +361,24 @@ public void init(AMRMProxyApplicationContext appContext) {
this.subClusterTimeOut =
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT;
}
// Number of retries used when launching and registering a UAM. A configured
// value that is not positive is replaced by the default.
this.registerUamRetryNum = conf.getInt(
    YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
    YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
if (this.registerUamRetryNum <= 0) {
  // Log the rejected retry count itself, not the sub-cluster timeout.
  LOG.info("{} configured to be {}, should be positive. Using default of {}.",
      YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
      this.registerUamRetryNum,
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
  this.registerUamRetryNum =
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT;
}
this.registerUamRetryInterval = conf.getTimeDuration(
YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
TimeUnit.MILLISECONDS);
this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE,
YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE);
}
@ -701,7 +725,7 @@ public AllocateResponse allocate(AllocateRequest request)
if (this.finishAMCalled) {
LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat "
+ "processing and return dummy response" + this.attemptId);
+ "processing and return dummy response.", this.attemptId);
return RECORD_FACTORY.newRecordInstance(AllocateResponse.class);
}
@ -1255,85 +1279,77 @@ private List<SubClusterId> registerAndAllocateWithNewSubClusters(
// Check to see if there are any new sub-clusters in this request
// list and create and register Unmanaged AM instance for the new ones
List<SubClusterId> newSubClusters = new ArrayList<>();
for (SubClusterId subClusterId : requests.keySet()) {
if (!subClusterId.equals(this.homeSubClusterId)
&& !this.uamPool.hasUAMId(subClusterId.getId())) {
newSubClusters.add(subClusterId);
requests.keySet().stream().forEach(subClusterId -> {
String id = subClusterId.getId();
if (!subClusterId.equals(this.homeSubClusterId) && !this.uamPool.hasUAMId(id)) {
newSubClusters.add(subClusterId);
// Set sub-cluster to be timed out initially
lastSCResponseTime.put(subClusterId,
clock.getTime() - subClusterTimeOut);
}
lastSCResponseTime.put(subClusterId, clock.getTime() - subClusterTimeOut);
}
});
this.uamRegisterFutures.clear();
for (final SubClusterId scId : newSubClusters) {
Future<?> future = this.threadpool.submit(new Runnable() {
@Override
public void run() {
Future<?> future = this.threadpool.submit(() -> {
String subClusterId = scId.getId();
// Create a config loaded with federation on and subclusterId
// for each UAM
YarnConfiguration config = new YarnConfiguration(getConf());
FederationProxyProviderUtil.updateConfForFederation(config,
subClusterId);
RegisterApplicationMasterResponse uamResponse = null;
Token<AMRMTokenIdentifier> token = null;
try {
FederationProxyProviderUtil.updateConfForFederation(config, subClusterId);
ApplicationId applicationId = attemptId.getApplicationId();
ApplicationSubmissionContext originalSubmissionContext =
federationFacade.getApplicationSubmissionContext(applicationId);
// For appNameSuffix, use subClusterId of the home sub-cluster
token = uamPool.launchUAM(subClusterId, config,
applicationId, amRegistrationResponse.getQueue(),
getApplicationContext().getUser(), homeSubClusterId.toString(),
true, subClusterId, originalSubmissionContext);
RegisterApplicationMasterResponse uamResponse;
Token<AMRMTokenIdentifier> token;
secondaryRelayers.put(subClusterId,
uamPool.getAMRMClientRelayer(subClusterId));
// LaunchUAM And RegisterApplicationMaster
try {
TokenAndRegisterResponse result =
((FederationActionRetry<TokenAndRegisterResponse>) (retryCount) ->
launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId)).
runWithRetries(registerUamRetryNum, registerUamRetryInterval);
uamResponse = uamPool.registerApplicationMaster(subClusterId,
amRegistrationRequest);
token = result.getToken();
uamResponse = result.getResponse();
} catch (Throwable e) {
LOG.error("Failed to register application master: " + subClusterId
+ " Application: " + attemptId, e);
// TODO: UAM registration for this sub-cluster RM
// failed. For now, we ignore the resource requests and continue
// but we need to fix this and handle this situation. One way would
// be to send the request to another RM by consulting the policy.
LOG.error("Failed to register application master: {} Application: {}.",
subClusterId, attemptId, e);
return;
}
uamRegistrations.put(scId, uamResponse);
LOG.info("Successfully registered unmanaged application master: "
+ subClusterId + " ApplicationId: " + attemptId);
uamRegistrations.put(scId, uamResponse);
LOG.info("Successfully registered unmanaged application master: {} " +
"ApplicationId: {}.", subClusterId, attemptId);
// Allocate Request
try {
uamPool.allocateAsync(subClusterId, requests.get(scId),
new HeartbeatCallBack(scId, true));
} catch (Throwable e) {
LOG.error("Failed to allocate async to " + subClusterId
+ " Application: " + attemptId, e);
LOG.error("Failed to allocate async to {} Application: {}.",
subClusterId, attemptId, e);
}
// Save the UAM token in registry or NMSS
try {
if (registryClient != null) {
registryClient.writeAMRMTokenForUAM(attemptId.getApplicationId(),
subClusterId, token);
registryClient.writeAMRMTokenForUAM(applicationId, subClusterId, token);
} else if (getNMStateStore() != null) {
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
NMSS_SECONDARY_SC_PREFIX + subClusterId,
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
}
} catch (Throwable e) {
LOG.error("Failed to persist UAM token from " + subClusterId
+ " Application: " + attemptId, e);
}
LOG.error("Failed to persist UAM token from {} Application {}",
subClusterId, attemptId, e);
}
});
this.uamRegisterFutures.put(scId, future);
}
@ -1347,10 +1363,34 @@ public void run() {
}
}
return newSubClusters;
}
/**
 * Launches the UAM for the given sub-cluster and then registers the
 * application master with it, returning the token and the register response
 * together.
 *
 * @param config configuration passed to the UAM launch.
 * @param subClusterId id of the sub-cluster; used both as the UAM id and as
 *          the key under which the AMRMClientRelayer is stored.
 * @param applicationId id of the application the UAM is launched for.
 * @return the AMRM token from the launch paired with the register response.
 * @throws IOException declared by this method; propagated from the calls it makes.
 * @throws YarnException declared by this method; propagated from the calls it makes.
 */
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(
    YarnConfiguration config, String subClusterId, ApplicationId applicationId)
    throws IOException, YarnException {

  // Gather the launch arguments up front.
  final ApplicationSubmissionContext submissionContext =
      federationFacade.getApplicationSubmissionContext(applicationId);
  final String user = getApplicationContext().getUser();
  final String homeSubCluster = homeSubClusterId.toString();
  final String queueName = amRegistrationResponse.getQueue();

  // The home sub-cluster id is used as the appNameSuffix.
  final Token<AMRMTokenIdentifier> uamToken = uamPool.launchUAM(
      subClusterId, config, applicationId, queueName, user, homeSubCluster,
      true, subClusterId, submissionContext);

  // Record which AMRMClientRelayer belongs to this sub-cluster.
  secondaryRelayers.put(subClusterId, uamPool.getAMRMClientRelayer(subClusterId));

  // Register the application master through the freshly launched UAM.
  final RegisterApplicationMasterResponse registration =
      uamPool.registerApplicationMaster(subClusterId, amRegistrationRequest);

  return new TokenAndRegisterResponse(uamToken, registration);
}
/**
* Prepare the base allocation response. Use lastSCResponse and
* lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g.

View File

@ -0,0 +1,45 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
/**
 * Holder pairing an AMRM token with a RegisterApplicationMasterResponse.
 * Both references are fixed at construction time and never reassigned.
 */
public class TokenAndRegisterResponse {
  /** The AMRM token. */
  private final Token<AMRMTokenIdentifier> token;
  /** The register-application-master response. */
  private final RegisterApplicationMasterResponse response;

  /**
   * Creates a holder for the given token and response.
   *
   * @param pToken the AMRM token to hold.
   * @param pResponse the register response to hold.
   */
  public TokenAndRegisterResponse(Token<AMRMTokenIdentifier> pToken,
      RegisterApplicationMasterResponse pResponse) {
    this.token = pToken;
    this.response = pResponse;
  }

  /**
   * @return the AMRM token passed to the constructor.
   */
  public Token<AMRMTokenIdentifier> getToken() {
    return token;
  }

  /**
   * @return the register response passed to the constructor.
   */
  public RegisterApplicationMasterResponse getResponse() {
    return response;
  }
}

View File

@ -38,7 +38,6 @@
import org.apache.hadoop.registry.client.api.RegistryOperations;
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.LambdaTestUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
@ -179,9 +178,8 @@ protected YarnConfiguration createConfiguration() {
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT,
500);
// Wait for UAM registration to be done
conf.setBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, true);
// Register UAM Retry Interval 1ms
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL, 1);
return conf;
}
@ -597,10 +595,6 @@ public Object run() throws Exception {
interceptor.recover(recoveredDataMap);
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
// Waiting for SC-1 to time out.
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
// SC1 should be initialized to be timed out
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
@ -859,7 +853,7 @@ public Object run() throws Exception {
List<Container> containers =
getContainersAndAssert(numberOfContainers, numberOfContainers * 2);
for (Container c : containers) {
LOG.info("Allocated container {}", c.getId());
LOG.info("Allocated container " + c.getId());
}
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
@ -893,10 +887,6 @@ public Object run() throws Exception {
int numberOfContainers = 3;
// Should re-attach secondaries and get the three running containers
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
// Waiting for SC-1 to time out.
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
// SC1 should be initialized to be timed out
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
Assert.assertEquals(numberOfContainers,
@ -1444,4 +1434,53 @@ private void finishApplication() throws IOException, YarnException {
Assert.assertNotNull(finishResponse);
Assert.assertTrue(finishResponse.getIsUnregistered());
}
/**
 * Runs a register / allocate / release / finish cycle against SC-1 with the
 * interceptor's retry counter preset to 2, and checks afterwards that the
 * counter has dropped to 0.
 */
@Test
public void testLaunchUAMAndRegisterApplicationMasterRetry() throws Exception {
  UserGroupInformation userWithToken = interceptor.getUGIWithToken(interceptor.getAttemptId());
  interceptor.setRetryCount(2);

  userWithToken.doAs((PrivilegedExceptionAction<Object>) () -> {
    // Step 1: register the application.
    RegisterApplicationMasterRequest registerRequest =
        Records.newRecord(RegisterApplicationMasterRequest.class);
    registerRequest.setHost(Integer.toString(testAppId));
    registerRequest.setRpcPort(0);
    registerRequest.setTrackingUrl("");
    RegisterApplicationMasterResponse registered =
        interceptor.registerApplicationMaster(registerRequest);
    Assert.assertNotNull(registered);
    lastResponseId = 0;
    Assert.assertEquals(0, interceptor.getUnmanagedAMPoolSize());

    // Step 2: with SC-1 active, allocate the first batch of containers.
    registerSubCluster(SubClusterId.newInstance("SC-1"));
    int containerCount = 3;
    List<Container> allocated = getContainersAndAssert(containerCount, containerCount);
    Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());

    // Step 3: give every container back.
    releaseContainersAndAssert(allocated);

    // Step 4: finish the application.
    FinishApplicationMasterRequest finishRequest =
        Records.newRecord(FinishApplicationMasterRequest.class);
    finishRequest.setDiagnostics("");
    finishRequest.setTrackingUrl("");
    finishRequest.setFinalApplicationStatus(FinalApplicationStatus.SUCCEEDED);
    FinishApplicationMasterResponse finished =
        interceptor.finishApplicationMaster(finishRequest);
    Assert.assertNotNull(finished);
    Assert.assertTrue(finished.getIsUnregistered());
    return null;
  });

  // The retry counter must have been counted down to zero.
  Assert.assertEquals(0, interceptor.getRetryCount());
}
}

View File

@ -55,6 +55,7 @@ public class TestableFederationInterceptor extends FederationInterceptor {
private MockResourceManagerFacade mockRm;
private boolean isClientRPC = false;
private int retryCount = 0;
public TestableFederationInterceptor() {
}
@ -258,6 +259,24 @@ protected <T> T createRMProxy(Class<T> protocol, Configuration config,
}
}
/**
 * Test hook: while the retry counter is positive, decrements it and throws a
 * YarnException instead of launching; once it is no longer positive,
 * delegates to the parent implementation.
 */
@Override
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(YarnConfiguration config,
    String subClusterId, ApplicationId applicationId) throws IOException, YarnException {
  // No simulated failures left: behave like the real interceptor.
  if (retryCount <= 0) {
    return super.launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId);
  }
  // Consume one simulated failure.
  retryCount--;
  throw new YarnException("launchUAMAndRegisterApplicationMaster will retry");
}
/**
 * Sets the retry counter. While the counter is positive, each call to
 * launchUAMAndRegisterApplicationMaster decrements it and throws a
 * YarnException.
 *
 * @param retryCount the new counter value.
 */
public void setRetryCount(int retryCount) {
this.retryCount = retryCount;
}
/**
 * Returns the current value of the retry counter.
 *
 * @return the retry counter.
 */
public int getRetryCount() {
return retryCount;
}
/**
* Wrap the handler thread, so it calls from the same user.
*/