From dd4b387d96abc66ddebb569b3775b18b19aed027 Mon Sep 17 00:00:00 2001 From: rohithsharmaks Date: Wed, 24 Jun 2015 23:00:14 +0530 Subject: [PATCH] YARN-3790. usedResource from rootQueue metrics may get stale data for FS scheduler after recovering the container (Zhihai Xu via rohithsharmaks) --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../resourcemanager/scheduler/fair/FairScheduler.java | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index a5fc86ba9d..9547f0fa7a 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -705,6 +705,9 @@ Release 2.7.1 - UNRELEASED YARN-3832. Resource Localization fails on a cluster due to existing cache directories (Brahma Reddy Battula via jlowe) + YARN-3790. usedResource from rootQueue metrics may get stale data for FS + scheduler after recovering the container (Zhihai Xu via rohithsharmaks) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 2ed3b2a1bf..cbc10e7c1b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes; +import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationConstants; @@ -841,11 +842,11 @@ protected synchronized void completedContainer(RMContainer rmContainer, + " with event: " + event); } - private synchronized void addNode(RMNode node) { + private synchronized void addNode(List containerReports, + RMNode node) { FSSchedulerNode schedulerNode = new FSSchedulerNode(node, usePortForNodeName); nodes.put(node.getNodeID(), schedulerNode); Resources.addTo(clusterResource, node.getTotalCapability()); - updateRootQueueMetrics(); updateMaximumAllocation(schedulerNode, true); triggerUpdate(); @@ -854,6 +855,9 @@ private synchronized void addNode(RMNode node) { queueMgr.getRootQueue().recomputeSteadyShares(); LOG.info("Added node " + node.getNodeAddress() + " cluster capacity: " + clusterResource); + + recoverContainersOnNode(containerReports, node); + updateRootQueueMetrics(); } private synchronized void removeNode(RMNode rmNode) { @@ -1147,8 +1151,7 @@ public void handle(SchedulerEvent event) { throw new RuntimeException("Unexpected event type: " + event); } NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent)event; - addNode(nodeAddedEvent.getAddedRMNode()); - recoverContainersOnNode(nodeAddedEvent.getContainerReports(), + addNode(nodeAddedEvent.getContainerReports(), nodeAddedEvent.getAddedRMNode()); break; case NODE_REMOVED: