From ea68756c0cd3e6e20b8e7045a8b7edd180ea4708 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Tue, 19 Nov 2019 14:10:41 +0530 Subject: [PATCH] YARN-8373. RM Received RMFatalEvent of type CRITICAL_THREAD_CRASH. Contributed by Wilfred Spiegelenburg. --- .../scheduler/ClusterNodeTracker.java | 18 +++++++++--------- .../scheduler/fair/FairScheduler.java | 15 +++++++++------ .../fair/TestContinuousScheduling.java | 2 -- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java index f4491df7bf..50c45fcce4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ClusterNodeTracker.java @@ -33,12 +33,12 @@ import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -386,21 +386,21 @@ public List getNodeIds(NodeFilter nodeFilter) { /** * Convenience method to sort nodes. + * Nodes can change while being sorted. Using a standard sort will fail + * without locking each node, the TreeSet handles this without locks. * - * Note that the sort is performed without holding a lock. We are sorting - * here instead of on the caller to allow for future optimizations (e.g. - * sort once every x milliseconds). + * @param comparator the comparator to sort the nodes with + * @return sorted set of nodes in the form of a TreeSet */ - public List sortedNodeList(Comparator comparator) { - List sortedList = null; + public TreeSet sortedNodeSet(Comparator comparator) { + TreeSet sortedSet = new TreeSet<>(comparator); readLock.lock(); try { - sortedList = new ArrayList(nodes.values()); + sortedSet.addAll(nodes.values()); } finally { readLock.unlock(); } - Collections.sort(sortedList, comparator); - return sortedList; + return sortedSet; } /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 04bbe0ff68..958eddc58a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -113,6 +113,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; @@ -1001,15 +1002,17 @@ protected void nodeUpdate(RMNode nm) { @Deprecated void continuousSchedulingAttempt() throws InterruptedException { long start = getClock().getTime(); - List nodeIdList; - // Hold a lock to prevent comparator order changes due to changes of node - // unallocated resources - synchronized (this) { - nodeIdList = nodeTracker.sortedNodeList(nodeAvailableResourceComparator); + TreeSet nodeIdSet; + // Hold a lock to prevent node changes as much as possible. + readLock.lock(); + try { + nodeIdSet = nodeTracker.sortedNodeSet(nodeAvailableResourceComparator); + } finally { + readLock.unlock(); } // iterate all nodes - for (FSSchedulerNode node : nodeIdList) { + for (FSSchedulerNode node : nodeIdSet) { try { if (Resources.fitsIn(minimumAllocation, node.getUnallocatedResource())) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestContinuousScheduling.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestContinuousScheduling.java index 5dac862216..895962d609 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestContinuousScheduling.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestContinuousScheduling.java @@ -335,9 +335,7 @@ public void run() { for (int j = 0; j < 100; j++) { for (FSSchedulerNode node : clusterNodeTracker.getAllNodes()) { int i = ThreadLocalRandom.current().nextInt(-30, 30); - synchronized (scheduler) { node.deductUnallocatedResource(Resource.newInstance(i * 1024, i)); - } } } }