From 497c86b485b1bb8a2eba52308646d8e1ee76bce3 Mon Sep 17 00:00:00 2001 From: Jian He Date: Sat, 18 Apr 2015 12:45:38 -0700 Subject: [PATCH] YARN-3136. Fixed a synchronization problem of AbstractYarnScheduler#getTransferredContainers. Contributed by Sunil G --- hadoop-yarn-project/CHANGES.txt | 3 ++ .../dev-support/findbugs-exclude.xml | 8 ++++ .../ApplicationMasterService.java | 47 ++++++++++--------- .../scheduler/AbstractYarnScheduler.java | 17 ++++++- 4 files changed, 51 insertions(+), 24 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index dfedc8a88f..71fde68115 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -227,6 +227,9 @@ Release 2.8.0 - UNRELEASED YARN-3493. RM fails to come up with error "Failed to load/recover state" when mem settings are changed. (Jian He via wangda) + YARN-3136. Fixed a synchronization problem of + AbstractYarnScheduler#getTransferredContainers. (Sunil G via jianhe) + Release 2.7.1 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index 8aae152f93..ece8548456 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -469,6 +469,14 @@ + + + + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java index 4ad8e879a1..7244b17f02 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java @@ -298,32 +298,35 @@ public RegisterApplicationMasterResponse registerApplicationMaster( // For work-preserving AM restart, retrieve previous attempts' containers // and corresponding NM tokens. - List transferredContainers = - ((AbstractYarnScheduler) rScheduler) + if (app.getApplicationSubmissionContext() + .getKeepContainersAcrossApplicationAttempts()) { + List transferredContainers = ((AbstractYarnScheduler) rScheduler) .getTransferredContainers(applicationAttemptId); - if (!transferredContainers.isEmpty()) { - response.setContainersFromPreviousAttempts(transferredContainers); - List nmTokens = new ArrayList(); - for (Container container : transferredContainers) { - try { - NMToken token = rmContext.getNMTokenSecretManager() - .createAndGetNMToken(app.getUser(), applicationAttemptId, - container); - if (null != token) { - nmTokens.add(token); - } - } catch (IllegalArgumentException e) { - // if it's a DNS issue, throw UnknowHostException directly and that - // will be automatically retried by RMProxy in RPC layer. - if (e.getCause() instanceof UnknownHostException) { - throw (UnknownHostException) e.getCause(); + if (!transferredContainers.isEmpty()) { + response.setContainersFromPreviousAttempts(transferredContainers); + List nmTokens = new ArrayList(); + for (Container container : transferredContainers) { + try { + NMToken token = rmContext.getNMTokenSecretManager() + .createAndGetNMToken(app.getUser(), applicationAttemptId, + container); + if (null != token) { + nmTokens.add(token); + } + } catch (IllegalArgumentException e) { + // if it's a DNS issue, throw UnknowHostException directly and + // that + // will be automatically retried by RMProxy in RPC layer. + if (e.getCause() instanceof UnknownHostException) { + throw (UnknownHostException) e.getCause(); + } } } + response.setNMTokensFromPreviousAttempts(nmTokens); + LOG.info("Application " + appID + " retrieved " + + transferredContainers.size() + " containers from previous" + + " attempts and " + nmTokens.size() + " NM tokens."); } - response.setNMTokensFromPreviousAttempts(nmTokens); - LOG.info("Application " + appID + " retrieved " - + transferredContainers.size() + " containers from previous" - + " attempts and " + nmTokens.size() + " NM tokens."); } response.setSchedulerResourceTypes(rScheduler diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 1a8c653cc5..ae927f1139 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -21,12 +21,15 @@ import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; @@ -67,6 +70,8 @@ @SuppressWarnings("unchecked") +@Private +@Unstable public abstract class AbstractYarnScheduler extends AbstractService implements ResourceScheduler { @@ -91,7 +96,12 @@ public abstract class AbstractYarnScheduler private long configuredMaximumAllocationWaitTime; protected RMContext rmContext; - protected Map> applications; + + /* + * All schedulers which are inheriting AbstractYarnScheduler should use + * concurrent version of 'applications' map. + */ + protected ConcurrentMap> applications; protected int nmExpireInterval; protected final static List EMPTY_CONTAINER_LIST = @@ -123,7 +133,7 @@ public void serviceInit(Configuration conf) throws Exception { super.serviceInit(conf); } - public synchronized List getTransferredContainers( + public List getTransferredContainers( ApplicationAttemptId currentAttempt) { ApplicationId appId = currentAttempt.getApplicationId(); SchedulerApplication app = applications.get(appId); @@ -132,6 +142,9 @@ public synchronized List getTransferredContainers( if (appImpl.getApplicationSubmissionContext().getUnmanagedAM()) { return containerList; } + if (app == null) { + return containerList; + } Collection liveContainers = app.getCurrentAppAttempt().getLiveContainers(); ContainerId amContainerId =