YARN-11509. Add retry logic to FederationInterceptor#launchUAM. (#5727)

This commit is contained in:
slfan1989 2023-07-12 09:47:07 +08:00 committed by GitHub
parent 33b1677e9e
commit 8b88e9f8f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 255 additions and 80 deletions

View File

@ -4058,6 +4058,20 @@ public static boolean isAclEnabled(Configuration conf) {
public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT = public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
60000; // one minute 60000; // one minute
// Configuration key: number of retries the AMRMProxy performs when registering a UAM.
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
// Default retry count when registering a UAM: 3.
public static final int DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
3;
// Configuration key: interval between retries when registering a UAM.
// Note that the key suffix is "interval" (not "retry-interval").
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
FEDERATION_PREFIX + "amrmproxy.register.uam.interval";
// Default retry interval, expressed in milliseconds: 100 ms.
public static final long DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
TimeUnit.MILLISECONDS.toMillis(100);
public static final String DEFAULT_FEDERATION_POLICY_KEY = "*"; public static final String DEFAULT_FEDERATION_POLICY_KEY = "*";
public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX
+ "policy-manager"; + "policy-manager";

View File

@ -5408,4 +5408,22 @@
</description> </description>
</property> </property>
<property>
<description>
The number of retries when registering a UAM.
The default value is 3.
</description>
<name>yarn.federation.amrmproxy.register.uam.retry-count</name>
<value>3</value>
</property>
<property>
<description>
The interval between retries when registering a UAM.
The default value is 100ms.
</description>
<name>yarn.federation.amrmproxy.register.uam.interval</name>
<value>100ms</value>
</property>
</configuration> </configuration>

View File

@ -36,6 +36,7 @@
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
@ -87,6 +88,7 @@
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy; import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException; import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver; import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
import org.apache.hadoop.yarn.server.federation.retry.FederationActionRetry;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId; import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo; import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient; import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
@ -251,6 +253,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
// the maximum wait time for the first async heart beat response // the maximum wait time for the first async heart beat response
private long heartbeatMaxWaitTimeMs; private long heartbeatMaxWaitTimeMs;
private int registerUamRetryNum;
private long registerUamRetryInterval;
private boolean waitUamRegisterDone; private boolean waitUamRegisterDone;
private MonotonicClock clock = new MonotonicClock(); private MonotonicClock clock = new MonotonicClock();
@ -355,6 +361,24 @@ public void init(AMRMProxyApplicationContext appContext) {
this.subClusterTimeOut = this.subClusterTimeOut =
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT; YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT;
} }
// Number of retries used when launching/registering a UAM; it must be positive,
// otherwise the default is used instead.
this.registerUamRetryNum = conf.getInt(
    YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
    YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
if (this.registerUamRetryNum <= 0) {
  // Log the rejected retry count itself (not the sub-cluster timeout) so the
  // message reflects what was actually configured for this key.
  LOG.info("{} configured to be {}, should be positive. Using default of {}.",
      YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
      this.registerUamRetryNum,
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
  this.registerUamRetryNum =
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT;
}
this.registerUamRetryInterval = conf.getTimeDuration(
YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
TimeUnit.MILLISECONDS);
this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE,
YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE); YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE);
} }
@ -701,7 +725,7 @@ public AllocateResponse allocate(AllocateRequest request)
if (this.finishAMCalled) { if (this.finishAMCalled) {
LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat " LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat "
+ "processing and return dummy response" + this.attemptId); + "processing and return dummy response.", this.attemptId);
return RECORD_FACTORY.newRecordInstance(AllocateResponse.class); return RECORD_FACTORY.newRecordInstance(AllocateResponse.class);
} }
@ -1255,85 +1279,77 @@ private List<SubClusterId> registerAndAllocateWithNewSubClusters(
// Check to see if there are any new sub-clusters in this request // Check to see if there are any new sub-clusters in this request
// list and create and register Unmanaged AM instance for the new ones // list and create and register Unmanaged AM instance for the new ones
List<SubClusterId> newSubClusters = new ArrayList<>(); List<SubClusterId> newSubClusters = new ArrayList<>();
for (SubClusterId subClusterId : requests.keySet()) {
if (!subClusterId.equals(this.homeSubClusterId)
&& !this.uamPool.hasUAMId(subClusterId.getId())) {
newSubClusters.add(subClusterId);
requests.keySet().stream().forEach(subClusterId -> {
String id = subClusterId.getId();
if (!subClusterId.equals(this.homeSubClusterId) && !this.uamPool.hasUAMId(id)) {
newSubClusters.add(subClusterId);
// Set sub-cluster to be timed out initially // Set sub-cluster to be timed out initially
lastSCResponseTime.put(subClusterId, lastSCResponseTime.put(subClusterId, clock.getTime() - subClusterTimeOut);
clock.getTime() - subClusterTimeOut);
} }
} });
this.uamRegisterFutures.clear(); this.uamRegisterFutures.clear();
for (final SubClusterId scId : newSubClusters) { for (final SubClusterId scId : newSubClusters) {
Future<?> future = this.threadpool.submit(new Runnable() {
@Override
public void run() {
String subClusterId = scId.getId();
// Create a config loaded with federation on and subclusterId Future<?> future = this.threadpool.submit(() -> {
// for each UAM
YarnConfiguration config = new YarnConfiguration(getConf());
FederationProxyProviderUtil.updateConfForFederation(config,
subClusterId);
RegisterApplicationMasterResponse uamResponse = null; String subClusterId = scId.getId();
Token<AMRMTokenIdentifier> token = null;
try {
ApplicationId applicationId = attemptId.getApplicationId();
ApplicationSubmissionContext originalSubmissionContext =
federationFacade.getApplicationSubmissionContext(applicationId);
// For appNameSuffix, use subClusterId of the home sub-cluster // Create a config loaded with federation on and subclusterId
token = uamPool.launchUAM(subClusterId, config, // for each UAM
applicationId, amRegistrationResponse.getQueue(), YarnConfiguration config = new YarnConfiguration(getConf());
getApplicationContext().getUser(), homeSubClusterId.toString(), FederationProxyProviderUtil.updateConfForFederation(config, subClusterId);
true, subClusterId, originalSubmissionContext); ApplicationId applicationId = attemptId.getApplicationId();
secondaryRelayers.put(subClusterId, RegisterApplicationMasterResponse uamResponse;
uamPool.getAMRMClientRelayer(subClusterId)); Token<AMRMTokenIdentifier> token;
uamResponse = uamPool.registerApplicationMaster(subClusterId, // LaunchUAM And RegisterApplicationMaster
amRegistrationRequest); try {
} catch (Throwable e) { TokenAndRegisterResponse result =
LOG.error("Failed to register application master: " + subClusterId ((FederationActionRetry<TokenAndRegisterResponse>) (retryCount) ->
+ " Application: " + attemptId, e); launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId)).
// TODO: UAM registration for this sub-cluster RM runWithRetries(registerUamRetryNum, registerUamRetryInterval);
// failed. For now, we ignore the resource requests and continue
// but we need to fix this and handle this situation. One way would token = result.getToken();
// be to send the request to another RM by consulting the policy. uamResponse = result.getResponse();
return; } catch (Throwable e) {
} LOG.error("Failed to register application master: {} Application: {}.",
uamRegistrations.put(scId, uamResponse); subClusterId, attemptId, e);
LOG.info("Successfully registered unmanaged application master: " return;
+ subClusterId + " ApplicationId: " + attemptId); }
try { uamRegistrations.put(scId, uamResponse);
uamPool.allocateAsync(subClusterId, requests.get(scId),
new HeartbeatCallBack(scId, true)); LOG.info("Successfully registered unmanaged application master: {} " +
} catch (Throwable e) { "ApplicationId: {}.", subClusterId, attemptId);
LOG.error("Failed to allocate async to " + subClusterId
+ " Application: " + attemptId, e); // Allocate Request
} try {
uamPool.allocateAsync(subClusterId, requests.get(scId),
// Save the UAM token in registry or NMSS new HeartbeatCallBack(scId, true));
try { } catch (Throwable e) {
if (registryClient != null) { LOG.error("Failed to allocate async to {} Application: {}.",
registryClient.writeAMRMTokenForUAM(attemptId.getApplicationId(), subClusterId, attemptId, e);
subClusterId, token); }
} else if (getNMStateStore() != null) {
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId, // Save the UAM token in registry or NMSS
NMSS_SECONDARY_SC_PREFIX + subClusterId, try {
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT)); if (registryClient != null) {
} registryClient.writeAMRMTokenForUAM(applicationId, subClusterId, token);
} catch (Throwable e) { } else if (getNMStateStore() != null) {
LOG.error("Failed to persist UAM token from " + subClusterId getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
+ " Application: " + attemptId, e); NMSS_SECONDARY_SC_PREFIX + subClusterId,
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
} }
} catch (Throwable e) {
LOG.error("Failed to persist UAM token from {} Application {}",
subClusterId, attemptId, e);
} }
}); });
this.uamRegisterFutures.put(scId, future); this.uamRegisterFutures.put(scId, future);
} }
@ -1347,10 +1363,34 @@ public void run() {
} }
} }
return newSubClusters; return newSubClusters;
} }
/**
 * Launches the UAM for one secondary sub-cluster and registers the
 * application master with it.
 *
 * <p>The steps are: look up the original submission context, launch the UAM
 * through the UAM pool, record the sub-cluster's AMRMClientRelayer in
 * {@code secondaryRelayers}, and finally register the application master.
 *
 * @param config configuration handed to the UAM pool for this sub-cluster.
 * @param subClusterId id of the sub-cluster, also used as the UAM id.
 * @param applicationId id of the application the UAM belongs to.
 * @return the AMRM token returned by the launch together with the register response.
 * @throws IOException declared by the underlying calls.
 * @throws YarnException declared by the underlying calls.
 */
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(
    YarnConfiguration config, String subClusterId, ApplicationId applicationId)
    throws IOException, YarnException {

  // Collect the launch parameters up front.
  final ApplicationSubmissionContext submissionContext =
      federationFacade.getApplicationSubmissionContext(applicationId);
  final String user = getApplicationContext().getUser();
  final String homeSubCluster = homeSubClusterId.toString();
  final String queueName = amRegistrationResponse.getQueue();

  // The id of the home sub-cluster serves as the appNameSuffix.
  final Token<AMRMTokenIdentifier> amrmToken = uamPool.launchUAM(
      subClusterId, config, applicationId, queueName, user, homeSubCluster,
      true, subClusterId, submissionContext);

  // Associate the sub-cluster with its AMRMClientRelayer.
  secondaryRelayers.put(subClusterId, uamPool.getAMRMClientRelayer(subClusterId));

  // Register the application master and bundle the result with the token.
  return new TokenAndRegisterResponse(amrmToken,
      uamPool.registerApplicationMaster(subClusterId, amRegistrationRequest));
}
/** /**
* Prepare the base allocation response. Use lastSCResponse and * Prepare the base allocation response. Use lastSCResponse and
* lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g. * lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g.

View File

@ -0,0 +1,45 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
/**
 * Holder that pairs the AMRM token with the RegisterApplicationMasterResponse.
 * Both references are set once in the constructor and never reassigned.
 */
public class TokenAndRegisterResponse {

  /** The AMRM token; assigned once at construction. */
  private final Token<AMRMTokenIdentifier> token;

  /** The register-application-master response; assigned once at construction. */
  private final RegisterApplicationMasterResponse response;

  /**
   * Creates a holder for the given token and response.
   *
   * @param pToken the AMRM token to hold.
   * @param pResponse the register response to hold.
   */
  public TokenAndRegisterResponse(Token<AMRMTokenIdentifier> pToken,
      RegisterApplicationMasterResponse pResponse) {
    this.token = pToken;
    this.response = pResponse;
  }

  /**
   * @return the AMRM token passed to the constructor.
   */
  public Token<AMRMTokenIdentifier> getToken() {
    return token;
  }

  /**
   * @return the register response passed to the constructor.
   */
  public RegisterApplicationMasterResponse getResponse() {
    return response;
  }
}

View File

@ -38,7 +38,6 @@
import org.apache.hadoop.registry.client.api.RegistryOperations; import org.apache.hadoop.registry.client.api.RegistryOperations;
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService; import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.LambdaTestUtils; import org.apache.hadoop.test.LambdaTestUtils;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
@ -179,9 +178,8 @@ protected YarnConfiguration createConfiguration() {
conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT, conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT,
500); 500);
// Wait UAM Register Down // Register UAM Retry Interval 1ms
conf.setBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, true); conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL, 1);
return conf; return conf;
} }
@ -597,10 +595,6 @@ public Object run() throws Exception {
interceptor.recover(recoveredDataMap); interceptor.recover(recoveredDataMap);
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
// Waiting for SC-1 to time out.
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
// SC1 should be initialized to be timed out // SC1 should be initialized to be timed out
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
@ -859,7 +853,7 @@ public Object run() throws Exception {
List<Container> containers = List<Container> containers =
getContainersAndAssert(numberOfContainers, numberOfContainers * 2); getContainersAndAssert(numberOfContainers, numberOfContainers * 2);
for (Container c : containers) { for (Container c : containers) {
LOG.info("Allocated container {}", c.getId()); LOG.info("Allocated container " + c.getId());
} }
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
@ -893,10 +887,6 @@ public Object run() throws Exception {
int numberOfContainers = 3; int numberOfContainers = 3;
// Should re-attach secondaries and get the three running containers // Should re-attach secondaries and get the three running containers
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
// Waiting for SC-1 to time out.
GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000);
// SC1 should be initialized to be timed out // SC1 should be initialized to be timed out
Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size());
Assert.assertEquals(numberOfContainers, Assert.assertEquals(numberOfContainers,
@ -1444,4 +1434,53 @@ private void finishApplication() throws IOException, YarnException {
Assert.assertNotNull(finishResponse); Assert.assertNotNull(finishResponse);
Assert.assertTrue(finishResponse.getIsUnregistered()); Assert.assertTrue(finishResponse.getIsUnregistered());
} }
/**
 * Runs a full register / allocate / release / finish cycle against SC-1 while
 * the testable interceptor is told to fail its first two
 * launchUAMAndRegisterApplicationMaster calls, then checks that the injected
 * failure counter has been used up.
 */
@Test
public void testLaunchUAMAndRegisterApplicationMasterRetry() throws Exception {
  UserGroupInformation ugi = interceptor.getUGIWithToken(interceptor.getAttemptId());
  // The next two launch-and-register calls will throw.
  interceptor.setRetryCount(2);

  PrivilegedExceptionAction<Object> scenario = () -> {
    // Step 1: register the application.
    RegisterApplicationMasterRequest registration =
        Records.newRecord(RegisterApplicationMasterRequest.class);
    registration.setHost(Integer.toString(testAppId));
    registration.setRpcPort(0);
    registration.setTrackingUrl("");

    RegisterApplicationMasterResponse registrationResult =
        interceptor.registerApplicationMaster(registration);
    Assert.assertNotNull(registrationResult);
    lastResponseId = 0;

    Assert.assertEquals(0, interceptor.getUnmanagedAMPoolSize());

    // Step 2: with SC-1 active, allocate the first batch of containers.
    registerSubCluster(SubClusterId.newInstance("SC-1"));

    final int containerCount = 3;
    List<Container> allocated = getContainersAndAssert(containerCount, containerCount);
    Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());

    // Step 3: give every container back.
    releaseContainersAndAssert(allocated);

    // Step 4: finish the application.
    FinishApplicationMasterRequest finish =
        Records.newRecord(FinishApplicationMasterRequest.class);
    finish.setDiagnostics("");
    finish.setTrackingUrl("");
    finish.setFinalApplicationStatus(FinalApplicationStatus.SUCCEEDED);

    FinishApplicationMasterResponse finishResult =
        interceptor.finishApplicationMaster(finish);
    Assert.assertNotNull(finishResult);
    Assert.assertTrue(finishResult.getIsUnregistered());

    return null;
  };
  ugi.doAs(scenario);

  // Every injected failure must have been consumed.
  Assert.assertEquals(0, interceptor.getRetryCount());
}
} }

View File

@ -55,6 +55,7 @@ public class TestableFederationInterceptor extends FederationInterceptor {
private MockResourceManagerFacade mockRm; private MockResourceManagerFacade mockRm;
private boolean isClientRPC = false; private boolean isClientRPC = false;
private int retryCount = 0;
public TestableFederationInterceptor() { public TestableFederationInterceptor() {
} }
@ -258,6 +259,24 @@ protected <T> T createRMProxy(Class<T> protocol, Configuration config,
} }
} }
/**
 * Test hook around the real launch-and-register call: while the injected
 * failure counter is positive, each call decrements it and throws a
 * YarnException; once it reaches zero the call is delegated to the parent
 * implementation.
 */
@Override
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(YarnConfiguration config,
    String subClusterId, ApplicationId applicationId) throws IOException, YarnException {
  // No failures left to inject: behave like the production interceptor.
  if (retryCount <= 0) {
    return super.launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId);
  }
  // Consume one injected failure.
  retryCount--;
  throw new YarnException("launchUAMAndRegisterApplicationMaster will retry");
}
/**
 * Sets how many upcoming launchUAMAndRegisterApplicationMaster calls should
 * fail with a YarnException before the call is delegated to the parent class.
 *
 * @param retryCount number of failures to inject.
 */
public void setRetryCount(int retryCount) {
this.retryCount = retryCount;
}
/**
 * Returns the number of injected failures that have not been consumed yet.
 *
 * @return the remaining failure count.
 */
public int getRetryCount() {
  return this.retryCount;
}
/** /**
* Wrap the handler thread, so it calls from the same user. * Wrap the handler thread, so it calls from the same user.
*/ */