From e8dc862d3856e9eaea124c625dade36f1dd53fe2 Mon Sep 17 00:00:00 2001 From: Eric Yang Date: Tue, 30 Jun 2020 11:39:16 -0700 Subject: [PATCH] YARN-9809. Added node manager health status to resource manager registration call. Contributed by Eric Badger via eyang --- .../hadoop/yarn/conf/YarnConfiguration.java | 7 ++ .../src/main/resources/yarn-default.xml | 7 ++ .../RegisterNodeManagerRequest.java | 19 +++++- .../pb/RegisterNodeManagerRequestPBImpl.java | 39 ++++++++++- .../yarn_server_common_service_protos.proto | 1 + .../nodemanager/NodeStatusUpdaterImpl.java | 3 +- .../health/NodeHealthScriptRunner.java | 11 ++- .../health/TimedHealthReporterService.java | 20 +++++- .../server/nodemanager/TestEventFlow.java | 5 ++ .../BaseContainerManagerTest.java | 68 +++++++++++-------- .../TestContainerManager.java | 6 +- .../containermanager/TestNMProxy.java | 4 +- .../TestContainerSchedulerQueuing.java | 2 +- .../ResourceTrackerService.java | 5 +- .../resourcemanager/rmnode/RMNodeImpl.java | 58 +++++++++++++--- .../rmnode/RMNodeStartedEvent.java | 10 ++- .../yarn/server/resourcemanager/MockNM.java | 22 ++++++ .../yarn/server/resourcemanager/MockRM.java | 7 +- .../server/resourcemanager/NodeManager.java | 3 +- .../TestRMNodeTransitions.java | 55 ++++++++++++--- .../resourcemanager/TestResourceManager.java | 29 +++++--- .../TestResourceTrackerService.java | 6 ++ .../TestRMAppLogAggregationStatus.java | 7 +- .../resourcetracker/TestNMExpiry.java | 7 ++ .../resourcetracker/TestNMReconnect.java | 7 ++ .../scheduler/TestAbstractYarnScheduler.java | 5 ++ .../scheduler/TestSchedulerHealth.java | 18 +++-- .../capacity/TestCapacityScheduler.java | 63 +++++++++++------ .../scheduler/fair/TestFairScheduler.java | 21 ++++-- .../scheduler/fifo/TestFifoScheduler.java | 25 +++++-- .../webapp/TestRMWebServicesNodes.java | 5 +- 31 files changed, 430 insertions(+), 115 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 85d5a58036..54e8888f0d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -2013,6 +2013,13 @@ public class YarnConfiguration extends Configuration { NM_PREFIX + "health-checker.interval-ms"; public static final long DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS = 10 * 60 * 1000; + /** Whether or not to run the node health script before the NM + * starts up.*/ + public static final String NM_HEALTH_CHECK_RUN_BEFORE_STARTUP = + NM_PREFIX + "health-checker.run-before-startup"; + public static final boolean DEFAULT_NM_HEALTH_CHECK_RUN_BEFORE_STARTUP = + false; + /** Health check time out period for all scripts.*/ public static final String NM_HEALTH_CHECK_TIMEOUT_MS = NM_PREFIX + "health-checker.timeout-ms"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index f09186ecf4..2f97a7cce7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1668,6 +1668,13 @@ 1200000 + + Whether or not to run the node health script + before the NM starts up. + yarn.nodemanager.health-checker.run-before-startup + false + + Frequency of running node health scripts. yarn.nodemanager.health-checker.interval-ms diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java index acec16fd56..54b39155c6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java @@ -26,6 +26,7 @@ import org.apache.hadoop.yarn.api.records.NodeAttribute; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeLabel; import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.util.Records; public abstract class RegisterNodeManagerRequest { @@ -53,14 +54,15 @@ public abstract class RegisterNodeManagerRequest { Resource physicalResource) { return newInstance(nodeId, httpPort, resource, nodeManagerVersionId, containerStatuses, runningApplications, nodeLabels, physicalResource, - null); + null, null); } public static RegisterNodeManagerRequest newInstance(NodeId nodeId, int httpPort, Resource resource, String nodeManagerVersionId, List containerStatuses, List runningApplications, Set nodeLabels, - Resource physicalResource, Set nodeAttributes) { + Resource physicalResource, Set nodeAttributes, + NodeStatus nodeStatus) { RegisterNodeManagerRequest request = Records.newRecord(RegisterNodeManagerRequest.class); request.setHttpPort(httpPort); @@ -72,6 +74,7 @@ public abstract class RegisterNodeManagerRequest { request.setNodeLabels(nodeLabels); request.setPhysicalResource(physicalResource); request.setNodeAttributes(nodeAttributes); + request.setNodeStatus(nodeStatus); return request; } @@ -133,4 +136,16 @@ public abstract class RegisterNodeManagerRequest { public abstract Set getNodeAttributes(); public abstract void setNodeAttributes(Set nodeAttributes); + + /** + * Get the status of the node. + * @return The status of the node. + */ + public abstract NodeStatus getNodeStatus(); + + /** + * Set the status of the node. + * @param nodeStatus The status of the node. + */ + public abstract void setNodeStatus(NodeStatus nodeStatus); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java index 317f8abd6f..d91cff2531 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.proto.YarnProtos.NodeIdProto; import org.apache.hadoop.yarn.proto.YarnProtos.NodeLabelProto; import org.apache.hadoop.yarn.proto.YarnProtos.NodeAttributeProto; import org.apache.hadoop.yarn.proto.YarnProtos.ResourceProto; +import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.NodeStatusProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.LogAggregationReportProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NMContainerStatusProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeLabelsProto; @@ -51,7 +52,9 @@ import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.RegisterNodeMa import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; - +import org.apache.hadoop.yarn.server.api.records.NodeStatus; +import org.apache.hadoop.yarn.server.api.records.impl.pb.NodeStatusPBImpl; + public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest { RegisterNodeManagerRequestProto proto = RegisterNodeManagerRequestProto.getDefaultInstance(); RegisterNodeManagerRequestProto.Builder builder = null; @@ -68,6 +71,7 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest /** Physical resources in the node. */ private Resource physicalResource = null; + private NodeStatus nodeStatus; public RegisterNodeManagerRequestPBImpl() { builder = RegisterNodeManagerRequestProto.newBuilder(); @@ -121,6 +125,9 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest if (this.logAggregationReportsForApps != null) { addLogAggregationStatusForAppsToProto(); } + if (this.nodeStatus != null) { + builder.setNodeStatus(convertToProtoFormat(this.nodeStatus)); + } } private void addLogAggregationStatusForAppsToProto() { @@ -359,6 +366,28 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest this.physicalResource = pPhysicalResource; } + @Override + public synchronized NodeStatus getNodeStatus() { + RegisterNodeManagerRequestProtoOrBuilder p = viaProto ? proto : builder; + if (this.nodeStatus != null) { + return this.nodeStatus; + } + if (!p.hasNodeStatus()) { + return null; + } + this.nodeStatus = convertFromProtoFormat(p.getNodeStatus()); + return this.nodeStatus; + } + + @Override + public synchronized void setNodeStatus(NodeStatus pNodeStatus) { + maybeInitBuilder(); + if (pNodeStatus == null) { + builder.clearNodeStatus(); + } + this.nodeStatus = pNodeStatus; + } + @Override public int hashCode() { return getProto().hashCode(); @@ -533,4 +562,12 @@ public class RegisterNodeManagerRequestPBImpl extends RegisterNodeManagerRequest } this.logAggregationReportsForApps = logAggregationStatusForApps; } + + private NodeStatusPBImpl convertFromProtoFormat(NodeStatusProto s) { + return new NodeStatusPBImpl(s); + } + + private NodeStatusProto convertToProtoFormat(NodeStatus s) { + return ((NodeStatusPBImpl)s).getProto(); + } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto index ff7153eca8..c643179888 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto @@ -74,6 +74,7 @@ message RegisterNodeManagerRequestProto { optional ResourceProto physicalResource = 9; repeated LogAggregationReportProto log_aggregation_reports_for_apps = 10; optional NodeAttributesProto nodeAttributes = 11; + optional NodeStatusProto nodeStatus = 12; } message RegisterNodeManagerResponseProto { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index 5e3693ae9c..0725d42309 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -392,10 +392,11 @@ public class NodeStatusUpdaterImpl extends AbstractService implements // during RM recovery synchronized (this.context) { List containerReports = getNMContainerStatuses(); + NodeStatus nodeStatus = getNodeStatus(0); RegisterNodeManagerRequest request = RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource, nodeManagerVersionId, containerReports, getRunningApplications(), - nodeLabels, physicalResource, nodeAttributes); + nodeLabels, physicalResource, nodeAttributes, nodeStatus); if (containerReports != null) { LOG.info("Registering with RM using containers :" + containerReports); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java index 1c9bd82bd4..af92b15e9c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/NodeHealthScriptRunner.java @@ -60,8 +60,9 @@ public class NodeHealthScriptRunner extends TimedHealthReporterService { "Node health script timed out"; private NodeHealthScriptRunner(String scriptName, long checkInterval, - long timeout, String[] scriptArgs) { - super(NodeHealthScriptRunner.class.getName(), checkInterval); + long timeout, String[] scriptArgs, boolean runBeforeStartup) { + super(NodeHealthScriptRunner.class.getName(), checkInterval, + runBeforeStartup); this.nodeHealthScript = scriptName; this.scriptTimeout = timeout; setTimerTask(new NodeHealthMonitorExecutor(scriptArgs)); @@ -91,6 +92,10 @@ public class NodeHealthScriptRunner extends TimedHealthReporterService { "interval-ms can not be set to a negative number."); } + boolean runBeforeStartup = conf.getBoolean( + YarnConfiguration.NM_HEALTH_CHECK_RUN_BEFORE_STARTUP, + YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_RUN_BEFORE_STARTUP); + // Determine time out String scriptTimeoutConfig = String.format( YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE, @@ -113,7 +118,7 @@ public class NodeHealthScriptRunner extends TimedHealthReporterService { String[] scriptArgs = conf.getStrings(scriptArgsConfig, new String[]{}); return new NodeHealthScriptRunner(nodeHealthScript, - checkIntervalMs, scriptTimeout, scriptArgs); + checkIntervalMs, scriptTimeout, scriptArgs, runBeforeStartup); } private enum HealthCheckerExitStatus { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java index a0c4d8b8eb..6a7a2911d2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/health/TimedHealthReporterService.java @@ -45,6 +45,7 @@ public abstract class TimedHealthReporterService extends AbstractService private Timer timer; private TimerTask task; private long intervalMs; + private boolean runBeforeStartup; TimedHealthReporterService(String name, long intervalMs) { super(name); @@ -52,6 +53,17 @@ public abstract class TimedHealthReporterService extends AbstractService this.healthReport = ""; this.lastReportedTime = System.currentTimeMillis(); this.intervalMs = intervalMs; + this.runBeforeStartup = false; + } + + TimedHealthReporterService(String name, long intervalMs, + boolean runBeforeStartup) { + super(name); + this.isHealthy = true; + this.healthReport = ""; + this.lastReportedTime = System.currentTimeMillis(); + this.intervalMs = intervalMs; + this.runBeforeStartup = runBeforeStartup; } @VisibleForTesting @@ -73,7 +85,13 @@ public abstract class TimedHealthReporterService extends AbstractService throw new Exception("Health reporting task hasn't been set!"); } timer = new Timer("HealthReporterService-Timer", true); - timer.scheduleAtFixedRate(task, 0, intervalMs); + long delay = 0; + if (runBeforeStartup) { + delay = intervalMs; + task.run(); + } + + timer.scheduleAtFixedRate(task, delay, intervalMs); super.serviceStart(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java index b1fc2f1aa2..3f4879b23e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestEventFlow.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager; +import static org.mockito.Mockito.mock; + import java.io.File; import java.io.IOException; import java.util.ArrayList; @@ -134,6 +136,9 @@ public class TestEventFlow { new DummyContainerManager(context, exec, del, nodeStatusUpdater, metrics, dirsHandler); nodeStatusUpdater.init(conf); + NodeResourceMonitorImpl nodeResourceMonitor = mock( + NodeResourceMonitorImpl.class); + ((NMContext) context).setNodeResourceMonitor(nodeResourceMonitor); ((NMContext)context).setContainerManager(containerManager); nodeStatusUpdater.start(); ((NMContext)context).setNodeStatusUpdater(nodeStatusUpdater); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java index 7a85bfab44..9ee3ce6bc8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/BaseContainerManagerTest.java @@ -26,6 +26,7 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.doNothing; +import org.apache.hadoop.yarn.server.nodemanager.NodeResourceMonitorImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -156,33 +157,21 @@ public abstract class BaseContainerManagerTest { protected NodeHealthCheckerService nodeHealthChecker; protected LocalDirsHandlerService dirsHandler; protected final long DUMMY_RM_IDENTIFIER = 1234; - - protected NodeStatusUpdater nodeStatusUpdater = new NodeStatusUpdaterImpl( - context, new AsyncDispatcher(), null, metrics) { - @Override - protected ResourceTracker getRMClient() { - return new LocalRMInterface(); - }; - - @Override - protected void stopRMProxy() { - return; - } - - @Override - protected void startStatusUpdater() { - return; // Don't start any updating thread. - } - - @Override - public long getRMIdentifier() { - // There is no real RM registration, simulate and set RMIdentifier - return DUMMY_RM_IDENTIFIER; - } - }; - + private NodeResourceMonitorImpl nodeResourceMonitor = mock( + NodeResourceMonitorImpl.class); + private NodeHealthCheckerService nodeHealthCheckerService; + private NodeStatusUpdater nodeStatusUpdater; protected ContainerManagerImpl containerManager = null; + public NodeStatusUpdater getNodeStatusUpdater() { + return nodeStatusUpdater; + } + + public void setNodeStatusUpdater( + NodeStatusUpdater nodeStatusUpdater) { + this.nodeStatusUpdater = nodeStatusUpdater; + } + protected ContainerExecutor createContainerExecutor() { DefaultContainerExecutor exec = new DefaultContainerExecutor(); exec.setConf(conf); @@ -218,11 +207,36 @@ public abstract class BaseContainerManagerTest { delSrvc.init(conf); dirsHandler = new LocalDirsHandlerService(); - nodeHealthChecker = new NodeHealthCheckerService(dirsHandler); - nodeHealthChecker.init(conf); + dirsHandler.init(conf); + nodeHealthCheckerService = new NodeHealthCheckerService(dirsHandler); + nodeStatusUpdater = new NodeStatusUpdaterImpl( + context, new AsyncDispatcher(), nodeHealthCheckerService, metrics) { + @Override + protected ResourceTracker getRMClient() { + return new LocalRMInterface(); + }; + + @Override + protected void stopRMProxy() { + return; + } + + @Override + protected void startStatusUpdater() { + return; // Don't start any updating thread. + } + + @Override + public long getRMIdentifier() { + // There is no real RM registration, simulate and set RMIdentifier + return DUMMY_RM_IDENTIFIER; + } + }; + containerManager = createContainerManager(delSrvc); ((NMContext)context).setContainerManager(containerManager); ((NMContext)context).setContainerExecutor(exec); + ((NMContext)context).setNodeResourceMonitor(nodeResourceMonitor); nodeStatusUpdater.init(conf); containerManager.init(conf); nodeStatusUpdater.start(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java index e215980882..4e63417bba 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java @@ -193,8 +193,8 @@ public class TestContainerManager extends BaseContainerManagerTest { @Override protected ContainerManagerImpl createContainerManager(DeletionService delSrvc) { - return new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater, - metrics, dirsHandler) { + return new ContainerManagerImpl(context, exec, delSrvc, + getNodeStatusUpdater(), metrics, dirsHandler) { @Override protected UserGroupInformation getRemoteUgi() throws YarnException { @@ -1704,7 +1704,7 @@ public class TestContainerManager extends BaseContainerManagerTest { @Test public void testNullTokens() throws Exception { ContainerManagerImpl cMgrImpl = - new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater, + new ContainerManagerImpl(context, exec, delSrvc, getNodeStatusUpdater(), metrics, dirsHandler); String strExceptionMsg = ""; try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java index 5f023f02df..32ff5724c3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java @@ -65,8 +65,8 @@ public class TestNMProxy extends BaseContainerManagerTest { @Override protected ContainerManagerImpl createContainerManager(DeletionService delSrvc) { - return new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater, - metrics, dirsHandler) { + return new ContainerManagerImpl(context, exec, delSrvc, + getNodeStatusUpdater(), metrics, dirsHandler) { @Override public StartContainersResponse startContainers( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerQueuing.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerQueuing.java index b21850cbcf..508b8bd091 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerQueuing.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerQueuing.java @@ -131,7 +131,7 @@ public class TestContainerSchedulerQueuing extends BaseContainerManagerTest { protected ContainerManagerImpl createContainerManager( DeletionService delSrvc) { return new ContainerManagerImpl(context, exec, delSrvc, - nodeStatusUpdater, metrics, dirsHandler) { + getNodeStatusUpdater(), metrics, dirsHandler) { @Override protected UserGroupInformation getRemoteUgi() throws YarnException { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index 2c89ddd9e9..7d6feea6f1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -335,6 +335,7 @@ public class ResourceTrackerService extends AbstractService implements Resource capability = request.getResource(); String nodeManagerVersion = request.getNMVersion(); Resource physicalResource = request.getPhysicalResource(); + NodeStatus nodeStatus = request.getNodeStatus(); RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); @@ -426,7 +427,7 @@ public class ResourceTrackerService extends AbstractService implements if (oldNode == null) { RMNodeStartedEvent startEvent = new RMNodeStartedEvent(nodeId, request.getNMContainerStatuses(), - request.getRunningApplications()); + request.getRunningApplications(), nodeStatus); if (request.getLogAggregationReportsForApps() != null && !request.getLogAggregationReportsForApps().isEmpty()) { if (LOG.isDebugEnabled()) { @@ -462,7 +463,7 @@ public class ResourceTrackerService extends AbstractService implements this.rmContext.getRMNodes().put(nodeId, rmNode); this.rmContext.getDispatcher().getEventHandler() - .handle(new RMNodeStartedEvent(nodeId, null, null)); + .handle(new RMNodeStartedEvent(nodeId, null, null, nodeStatus)); } else { // Reset heartbeat ID since node just restarted. oldNode.resetLastNodeHeartBeatResponse(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index a565fe7565..68f44dc6d5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -36,6 +36,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.commons.collections.keyvalue.DefaultMapEntry; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -208,7 +209,8 @@ public class RMNodeImpl implements RMNode, EventHandler { RMNodeEventType, RMNodeEvent>(NodeState.NEW) //Transitions from NEW state - .addTransition(NodeState.NEW, NodeState.RUNNING, + .addTransition(NodeState.NEW, + EnumSet.of(NodeState.RUNNING, NodeState.UNHEALTHY), RMNodeEventType.STARTED, new AddNodeTransition()) .addTransition(NodeState.NEW, NodeState.NEW, RMNodeEventType.RESOURCE_UPDATE, @@ -707,7 +709,6 @@ public class RMNodeImpl implements RMNode, EventHandler { private void updateMetricsForRejoinedNode(NodeState previousNodeState) { ClusterMetrics metrics = ClusterMetrics.getMetrics(); - metrics.incrNumActiveNodes(); switch (previousNodeState) { case LOST: @@ -850,10 +851,10 @@ public class RMNodeImpl implements RMNode, EventHandler { } public static class AddNodeTransition implements - SingleArcTransition { + MultipleArcTransition { @Override - public void transition(RMNodeImpl rmNode, RMNodeEvent event) { + public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) { // Inform the scheduler RMNodeStartedEvent startEvent = (RMNodeStartedEvent) event; List containers = null; @@ -871,8 +872,6 @@ public class RMNodeImpl implements RMNode, EventHandler { if (previousRMNode != null) { ClusterMetrics.getMetrics().decrDecommisionedNMs(); } - // Increment activeNodes explicitly because this is a new node. - ClusterMetrics.getMetrics().incrNumActiveNodes(); containers = startEvent.getNMContainerStatuses(); if (containers != null && !containers.isEmpty()) { for (NMContainerStatus container : containers) { @@ -889,17 +888,37 @@ public class RMNodeImpl implements RMNode, EventHandler { } } - rmNode.context.getDispatcher().getEventHandler() - .handle(new NodeAddedSchedulerEvent(rmNode, containers)); - rmNode.context.getDispatcher().getEventHandler().handle( - new NodesListManagerEvent( - NodesListManagerEventType.NODE_USABLE, rmNode)); + NodeState nodeState; + NodeStatus nodeStatus = + startEvent.getNodeStatus(); + + if (nodeStatus == null) { + nodeState = NodeState.RUNNING; + reportNodeRunning(rmNode, containers); + } else { + RMNodeStatusEvent rmNodeStatusEvent = + new RMNodeStatusEvent(nodeId, nodeStatus); + + NodeHealthStatus nodeHealthStatus = + updateRMNodeFromStatusEvents(rmNode, rmNodeStatusEvent); + + if (nodeHealthStatus.getIsNodeHealthy()) { + nodeState = NodeState.RUNNING; + reportNodeRunning(rmNode, containers); + } else { + nodeState = NodeState.UNHEALTHY; + reportNodeUnusable(rmNode, nodeState); + } + } + List logAggregationReportsForApps = startEvent.getLogAggregationReportsForApps(); if (logAggregationReportsForApps != null && !logAggregationReportsForApps.isEmpty()) { rmNode.handleLogAggregationStatus(logAggregationReportsForApps); } + + return nodeState; } } @@ -1110,6 +1129,22 @@ public class RMNodeImpl implements RMNode, EventHandler { } } + /** + * Report node is RUNNING. + * @param rmNode + * @param containers + */ + public static void reportNodeRunning(RMNodeImpl rmNode, + List containers) { + rmNode.context.getDispatcher().getEventHandler() + .handle(new NodeAddedSchedulerEvent(rmNode, containers)); + rmNode.context.getDispatcher().getEventHandler().handle( + new NodesListManagerEvent( + NodesListManagerEventType.NODE_USABLE, rmNode)); + // Increment activeNodes explicitly because this is a new node. + ClusterMetrics.getMetrics().incrNumActiveNodes(); + } + /** * Report node is UNUSABLE and update metrics. * @param rmNode @@ -1301,6 +1336,7 @@ public class RMNodeImpl implements RMNode, EventHandler { // notifiers get update metadata because they will very likely query it // upon notification // Update metrics + ClusterMetrics.getMetrics().incrNumActiveNodes(); rmNode.updateMetricsForRejoinedNode(NodeState.UNHEALTHY); return NodeState.RUNNING; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java index 397699453f..2bf04d0fe7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java @@ -24,19 +24,23 @@ import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; public class RMNodeStartedEvent extends RMNodeEvent { + private final NodeStatus nodeStatus; private List containerStatuses; private List runningApplications; private List logAggregationReportsForApps; public RMNodeStartedEvent(NodeId nodeId, List containerReports, - List runningApplications) { + List runningApplications, + NodeStatus nodeStatus) { super(nodeId, RMNodeEventType.STARTED); this.containerStatuses = containerReports; this.runningApplications = runningApplications; + this.nodeStatus = nodeStatus; } public List getNMContainerStatuses() { @@ -47,6 +51,10 @@ public class RMNodeStartedEvent extends RMNodeEvent { return runningApplications; } + public NodeStatus getNodeStatus() { + return nodeStatus; + } + public List getLogAggregationReportsForApps() { return this.logAggregationReportsForApps; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java index 3543bc4707..d433753701 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java @@ -18,6 +18,9 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -187,6 +190,17 @@ public class MockNM { req.setNodeLabels(nodeLabels); } + NodeStatus status = Records.newRecord(NodeStatus.class); + status.setResponseId(0); + status.setNodeId(nodeId); + status.setContainersStatuses(new ArrayList<>(containerStats.values())); + NodeHealthStatus healthStatus = Records.newRecord(NodeHealthStatus.class); + healthStatus.setHealthReport(""); + healthStatus.setIsNodeHealthy(true); + healthStatus.setLastHealthReportTime(1); + status.setNodeHealthStatus(healthStatus); + req.setNodeStatus(status); + RegisterNodeManagerResponse registrationResponse = resourceTracker.registerNodeManager(req); this.currentContainerTokenMasterKey = @@ -364,6 +378,14 @@ public class MockNM { return heartbeatResponse; } + public static NodeStatus createMockNodeStatus() { + NodeStatus mockNodeStatus = mock(NodeStatus.class); + NodeHealthStatus mockNodeHealthStatus = mock(NodeHealthStatus.class); + when(mockNodeStatus.getNodeHealthStatus()).thenReturn(mockNodeHealthStatus); + when(mockNodeHealthStatus.getIsNodeHealthy()).thenReturn(true); + return mockNodeStatus; + } + public long getMemory() { return capability.getMemorySize(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index b3888c3cd6..90c554361c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.security.token.Token; @@ -54,6 +56,7 @@ import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEvent; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.ApplicationMasterLauncher; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; @@ -543,7 +546,9 @@ public class MockRM extends ResourceManager { public void sendNodeStarted(MockNM nm) throws Exception { RMNodeImpl node = (RMNodeImpl) getRMContext().getRMNodes().get( nm.getNodeId()); - node.handle(new RMNodeStartedEvent(nm.getNodeId(), null, null)); + NodeStatus mockNodeStatus = createMockNodeStatus(); + node.handle(new RMNodeStartedEvent(nm.getNodeId(), null, null, + mockNodeStatus)); drainEventsImplicitly(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/NodeManager.java index 1e4b050816..06c4527e5b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/NodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/NodeManager.java @@ -98,7 +98,7 @@ public class NodeManager implements ContainerManagementProtocol { public NodeManager(String hostName, int containerManagerPort, int httpPort, String rackName, Resource capability, - ResourceManager resourceManager) + ResourceManager resourceManager, NodeStatus nodestatus) throws IOException, YarnException { this.containerManagerAddress = hostName + ":" + containerManagerPort; this.nodeHttpAddress = hostName + ":" + httpPort; @@ -113,6 +113,7 @@ public class NodeManager implements ContainerManagementProtocol { request.setResource(capability); request.setNodeId(this.nodeId); request.setNMVersion(YarnVersionInfo.getVersion()); + request.setNodeStatus(nodestatus); resourceTrackerService.registerNodeManager(request); this.resourceManager = resourceManager; resourceManager.getResourceScheduler().getNodeReport(this.nodeId); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java index 1f1e164cf5..c907cb778e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.yarn.server.resourcemanager; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doAnswer; @@ -216,8 +217,9 @@ public class TestRMNodeTransitions { @Test (timeout = 5000) public void testExpiredContainer() { + NodeStatus mockNodeStatus = createMockNodeStatus(); // Start the node - node.handle(new RMNodeStartedEvent(null, null, null)); + node.handle(new RMNodeStartedEvent(null, null, null, mockNodeStatus)); verify(scheduler).handle(any(NodeAddedSchedulerEvent.class)); // Expire a container @@ -280,12 +282,13 @@ public class TestRMNodeTransitions { @Test (timeout = 5000) public void testContainerUpdate() throws InterruptedException{ + NodeStatus mockNodeStatus = createMockNodeStatus(); //Start the node - node.handle(new RMNodeStartedEvent(null, null, null)); + node.handle(new RMNodeStartedEvent(null, null, null, mockNodeStatus)); NodeId nodeId = BuilderUtils.newNodeId("localhost:1", 1); RMNodeImpl node2 = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null, null, null); - node2.handle(new RMNodeStartedEvent(null, null, null)); + node2.handle(new RMNodeStartedEvent(null, null, null, mockNodeStatus)); ApplicationId app0 = BuilderUtils.newApplicationId(0, 0); ApplicationId app1 = BuilderUtils.newApplicationId(1, 1); @@ -341,8 +344,9 @@ public class TestRMNodeTransitions { @Test (timeout = 5000) public void testStatusChange(){ + NodeStatus mockNodeStatus = createMockNodeStatus(); //Start the node - node.handle(new RMNodeStartedEvent(null, null, null)); + node.handle(new RMNodeStartedEvent(null, null, null, mockNodeStatus)); //Add info to the queue first node.setNextHeartBeat(false); @@ -607,6 +611,33 @@ public class TestRMNodeTransitions { Assert.assertEquals(NodeState.REBOOTED, node.getState()); } + @Test + public void testAddUnhealthyNode() { + ClusterMetrics cm = ClusterMetrics.getMetrics(); + int initialUnhealthy = cm.getUnhealthyNMs(); + int initialActive = cm.getNumActiveNMs(); + int initialLost = cm.getNumLostNMs(); + int initialDecommissioned = cm.getNumDecommisionedNMs(); + int initialRebooted = cm.getNumRebootedNMs(); + + NodeHealthStatus status = NodeHealthStatus.newInstance(false, "sick", + System.currentTimeMillis()); + NodeStatus nodeStatus = NodeStatus.newInstance(node.getNodeID(), 0, + new ArrayList<>(), null, status, null, null, null); + node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null, + nodeStatus)); + + Assert.assertEquals("Unhealthy Nodes", + initialUnhealthy + 1, cm.getUnhealthyNMs()); + Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); + Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); + Assert.assertEquals("Decommissioned Nodes", + initialDecommissioned, cm.getNumDecommisionedNMs()); + Assert.assertEquals("Rebooted Nodes", + initialRebooted, cm.getNumRebootedNMs()); + Assert.assertEquals(NodeState.UNHEALTHY, node.getState()); + } + @Test public void testNMShutdown() { RMNodeImpl node = getRunningNode(); @@ -712,7 +743,9 @@ public class TestRMNodeTransitions { Resource capability = Resource.newInstance(4096, 4); RMNodeImpl node = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null, capability, nmVersion); - node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null)); + NodeStatus mockNodeStatus = createMockNodeStatus(); + node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null, + mockNodeStatus)); Assert.assertEquals(NodeState.RUNNING, node.getState()); return node; } @@ -763,7 +796,10 @@ public class TestRMNodeTransitions { Resource capability = Resource.newInstance(4096, 4); RMNodeImpl node = new RMNodeImpl(nodeId, rmContext,null, 0, 0, null, capability, null); - node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null)); + NodeStatus mockNodeStatus = createMockNodeStatus(); + + node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null, + mockNodeStatus)); Assert.assertEquals(NodeState.RUNNING, node.getState()); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.REBOOTING)); Assert.assertEquals(NodeState.REBOOTED, node.getState()); @@ -779,7 +815,9 @@ public class TestRMNodeTransitions { int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); - node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null)); + NodeStatus mockNodeStatus = createMockNodeStatus(); + node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null, + mockNodeStatus)); Assert.assertEquals("Active Nodes", initialActive + 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", @@ -1075,8 +1113,9 @@ public class TestRMNodeTransitions { @Test public void testForHandlingDuplicatedCompltedContainers() { + NodeStatus mockNodeStatus = createMockNodeStatus(); // Start the node - node.handle(new RMNodeStartedEvent(null, null, null)); + node.handle(new RMNodeStartedEvent(null, null, null, mockNodeStatus)); // Add info to the queue first node.setNextHeartBeat(false); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java index 411b848217..1cb5e1d0e7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; @@ -37,6 +38,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl; @@ -88,12 +90,12 @@ public class TestResourceManager { private org.apache.hadoop.yarn.server.resourcemanager.NodeManager registerNode(String hostName, int containerManagerPort, int httpPort, - String rackName, Resource capability) throws IOException, - YarnException { + String rackName, Resource capability, NodeStatus nodeStatus) + throws IOException, YarnException { org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm = new org.apache.hadoop.yarn.server.resourcemanager.NodeManager( hostName, containerManagerPort, httpPort, rackName, capability, - resourceManager); + resourceManager, nodeStatus); NodeAddedSchedulerEvent nodeAddEvent1 = new NodeAddedSchedulerEvent(resourceManager.getRMContext() .getRMNodes().get(nm.getNodeId())); @@ -109,26 +111,30 @@ public class TestResourceManager { final int memory = 4 * 1024; final int vcores = 4; - + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host1 = "host1"; org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm1 = registerNode(host1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(memory, vcores)); + Resources.createResource(memory, vcores), mockNodeStatus); // Register node2 String host2 = "host2"; org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm2 = registerNode(host2, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(memory/2, vcores/2)); + Resources.createResource(memory/2, vcores/2), mockNodeStatus); // nodes should be in RUNNING state RMNodeImpl node1 = (RMNodeImpl) resourceManager.getRMContext().getRMNodes().get( nm1.getNodeId()); RMNodeImpl node2 = (RMNodeImpl) resourceManager.getRMContext().getRMNodes().get( nm2.getNodeId()); - node1.handle(new RMNodeStartedEvent(nm1.getNodeId(), null, null)); - node2.handle(new RMNodeStartedEvent(nm2.getNodeId(), null, null)); + node1.handle(new RMNodeStartedEvent(nm1.getNodeId(), null, null, + mockNodeStatus)); + node2.handle(new RMNodeStartedEvent(nm2.getNodeId(), null, null, + mockNodeStatus)); // Submit an application Application application = new Application("user1", resourceManager); @@ -216,9 +222,12 @@ public class TestResourceManager { public void testNodeHealthReportIsNotNull() throws Exception{ String host1 = "host1"; final int memory = 4 * 1024; + + NodeStatus mockNodeStatus = createMockNodeStatus(); + org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm1 = - registerNode(host1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(memory, 1)); + registerNode(host1, 1234, 2345, NetworkTopology.DEFAULT_RACK, + Resources.createResource(memory, 1), mockNodeStatus); nm1.heartbeat(); nm1.heartbeat(); Collection values = resourceManager.getRMContext().getRMNodes().values(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 6690339d89..066e394561 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -31,6 +31,8 @@ import org.apache.hadoop.yarn.server.api.ServerRMProxy; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeToAttributes; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.NodeEventDispatcher; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.FileSystemNodeAttributeStore; + +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -2712,10 +2714,14 @@ public class TestResourceTrackerService extends NodeLabelTestBase { RegisterNodeManagerRequest.class); NodeId nodeId = NodeId.newInstance("host2", 1234); Resource capability = BuilderUtils.newResource(1024, 1); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + req.setResource(capability); req.setNodeId(nodeId); req.setHttpPort(1234); req.setNMVersion(YarnVersionInfo.getVersion()); + req.setNodeStatus(mockNodeStatus); ContainerId c1 = ContainerId.newContainerId(appAttemptId, 1); ContainerId c2 = ContainerId.newContainerId(appAttemptId, 2); ContainerId c3 = ContainerId.newContainerId(appAttemptId, 3); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java index 8d31fe1a8b..6836288ed1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.logaggregationstatus; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.mock; @@ -139,13 +140,15 @@ public class TestRMAppLogAggregationStatus { Resource capability = Resource.newInstance(4096, 4); RMNodeImpl node1 = new RMNodeImpl(nodeId1, rmContext, null, 0, 0, null, capability, null); - node1.handle(new RMNodeStartedEvent(nodeId1, null, null)); + NodeStatus mockNodeStatus = createMockNodeStatus(); + node1.handle(new RMNodeStartedEvent(nodeId1, null, null, mockNodeStatus)); rmApp.handle(new RMAppRunningOnNodeEvent(this.appId, nodeId1)); NodeId nodeId2 = NodeId.newInstance("localhost", 2345); RMNodeImpl node2 = new RMNodeImpl(nodeId2, rmContext, null, 0, 0, null, capability, null); - node2.handle(new RMNodeStartedEvent(node2.getNodeID(), null, null)); + node2.handle(new RMNodeStartedEvent(node2.getNodeID(), null, null, + mockNodeStatus)); rmApp.handle(new RMAppRunningOnNodeEvent(this.appId, nodeId2)); // The initial log aggregation status for these two nodes diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java index f69faf4ea5..017a1e021d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMExpiry.java @@ -18,6 +18,9 @@ package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; + +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.junit.Assert; import org.slf4j.Logger; @@ -135,12 +138,15 @@ public class TestNMExpiry { String hostname3 = "localhost3"; Resource capability = BuilderUtils.newResource(1024, 1); + NodeStatus mockNodeStatus = createMockNodeStatus(); + RegisterNodeManagerRequest request1 = recordFactory .newRecordInstance(RegisterNodeManagerRequest.class); NodeId nodeId1 = NodeId.newInstance(hostname1, 0); request1.setNodeId(nodeId1); request1.setHttpPort(0); request1.setResource(capability); + request1.setNodeStatus(mockNodeStatus); resourceTrackerService.registerNodeManager(request1); RegisterNodeManagerRequest request2 = recordFactory @@ -149,6 +155,7 @@ public class TestNMExpiry { request2.setNodeId(nodeId2); request2.setHttpPort(0); request2.setResource(capability); + request2.setNodeStatus(mockNodeStatus); resourceTrackerService.registerNodeManager(request2); int waitCount = 0; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java index 3c4e6b424d..817fb9dfc3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; + import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -36,6 +38,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse; import org.apache.hadoop.yarn.server.api.records.NodeAction; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.ParameterizedSchedulerTestBase; @@ -178,9 +181,13 @@ public class TestNMReconnect extends ParameterizedSchedulerTestBase { RegisterNodeManagerRequest request1 = recordFactory .newRecordInstance(RegisterNodeManagerRequest.class); NodeId nodeId1 = NodeId.newInstance(hostname1, 0); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + request1.setNodeId(nodeId1); request1.setHttpPort(0); request1.setResource(capability); + request1.setNodeStatus(mockNodeStatus); resourceTrackerService.registerNodeManager(request1); Assert.assertNotNull(context.getRMNodes().get(nodeId1)); // verify Scheduler and RMContext use same RMNode reference. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java index e67deb5245..2860335d8a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.junit.Assert.assertEquals; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -49,6 +50,7 @@ import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; @@ -1051,9 +1053,12 @@ public class TestAbstractYarnScheduler extends ParameterizedSchedulerTestBase { RegisterNodeManagerRequest request1 = recordFactory.newRecordInstance(RegisterNodeManagerRequest.class); NodeId nodeId1 = NodeId.newInstance(hostname1, 0); + NodeStatus mockNodeStatus = createMockNodeStatus(); + request1.setNodeId(nodeId1); request1.setHttpPort(0); request1.setResource(capability); + request1.setNodeStatus(mockNodeStatus); privateResourceTrackerService.registerNodeManager(request1); privateDispatcher.await(); Resource clusterResource = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerHealth.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerHealth.java index 83a354de5a..a75be7745f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerHealth.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerHealth.java @@ -27,6 +27,7 @@ import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.NodeManager; import org.apache.hadoop.yarn.server.resourcemanager.Application; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; @@ -43,6 +44,7 @@ import org.junit.Test; import java.io.IOException; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.junit.Assume.assumeTrue; public class TestSchedulerHealth { @@ -170,11 +172,11 @@ public class TestSchedulerHealth { } private NodeManager registerNode(String hostName, int containerManagerPort, - int httpPort, String rackName, Resource capability) throws IOException, - YarnException { + int httpPort, String rackName, Resource capability, NodeStatus nodeStatus) + throws IOException, YarnException { NodeManager nm = new NodeManager(hostName, containerManagerPort, httpPort, rackName, - capability, resourceManager); + capability, resourceManager, nodeStatus); NodeAddedSchedulerEvent nodeAddEvent1 = new NodeAddedSchedulerEvent(resourceManager.getRMContext().getRMNodes() .get(nm.getNodeId())); @@ -200,11 +202,13 @@ public class TestSchedulerHealth { assumeTrue("This test is only supported on Capacity Scheduler", isCapacityScheduler); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * 1024, 1)); + Resources.createResource(5 * 1024, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -275,15 +279,17 @@ public class TestSchedulerHealth { assumeTrue("This test is only supported on Capacity Scheduler", isCapacityScheduler); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register nodes String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(2 * 1024, 1)); + Resources.createResource(2 * 1024, 1), mockNodeStatus); String host_1 = "host_1"; NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * 1024, 1)); + Resources.createResource(5 * 1024, 1), mockNodeStatus); nodeUpdate(nm_0); nodeUpdate(nm_1); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index a746f06f27..1fe7a53107 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.assertj.core.api.Assertions.assertThat; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB; @@ -54,6 +55,7 @@ import java.util.concurrent.CyclicBarrier; import com.google.common.collect.Sets; import org.apache.hadoop.service.ServiceStateException; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -242,9 +244,10 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { private NodeManager registerNode(ResourceManager rm, String hostName, int containerManagerPort, int httpPort, String rackName, - Resource capability) throws IOException, YarnException { + Resource capability, NodeStatus nodeStatus) + throws IOException, YarnException { NodeManager nm = new NodeManager(hostName, - containerManagerPort, httpPort, rackName, capability, rm); + containerManagerPort, httpPort, rackName, capability, rm, nodeStatus); NodeAddedSchedulerEvent nodeAddEvent1 = new NodeAddedSchedulerEvent(rm.getRMContext().getRMNodes() .get(nm.getNodeId())); @@ -286,11 +289,11 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { } private NodeManager registerNode(String hostName, int containerManagerPort, - int httpPort, String rackName, - Resource capability) - throws IOException, YarnException { + int httpPort, String rackName, + Resource capability, NodeStatus nodeStatus) + throws IOException, YarnException { NodeManager nm = new NodeManager(hostName, containerManagerPort, httpPort, - rackName, capability, resourceManager); + rackName, capability, resourceManager, nodeStatus); NodeAddedSchedulerEvent nodeAddEvent1 = new NodeAddedSchedulerEvent(resourceManager.getRMContext() .getRMNodes().get(nm.getNodeId())); @@ -303,17 +306,19 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { LOG.info("--- START: testCapacityScheduler ---"); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(4 * GB, 1)); + Resources.createResource(4 * GB, 1), mockNodeStatus); // Register node2 String host_1 = "host_1"; NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(2 * GB, 1)); + Resources.createResource(2 * GB, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -443,11 +448,13 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { when(mC.getConfigurationProvider()).thenReturn( new LocalConfigurationProvider()); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host0 = "host_0"; NodeManager nm0 = registerNode(rm, host0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(10 * GB, 10)); + Resources.createResource(10 * GB, 10), mockNodeStatus); // ResourceRequest priorities Priority priority0 = Priority.newInstance(0); @@ -545,11 +552,13 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { when(mC.getConfigurationProvider()).thenReturn( new LocalConfigurationProvider()); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host0 = "host_0"; NodeManager nm0 = registerNode(rm, host0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(10 * GB, 10)); + Resources.createResource(10 * GB, 10), mockNodeStatus); // ResourceRequest priorities Priority priority0 = Priority.newInstance(0); @@ -2097,17 +2106,20 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { public void testMoveAppForMoveToQueueWithFreeCap() throws Exception { ResourceScheduler scheduler = resourceManager.getResourceScheduler(); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(4 * GB, 1)); + Resources.createResource(4 * GB, 1), mockNodeStatus); // Register node2 String host_1 = "host_1"; NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(2 * GB, 1)); + Resources.createResource(2 * GB, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -2213,17 +2225,19 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { ResourceScheduler scheduler = resourceManager.getResourceScheduler(); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * GB, 1)); + Resources.createResource(5 * GB, 1), mockNodeStatus); // Register node2 String host_1 = "host_1"; NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * GB, 1)); + Resources.createResource(5 * GB, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -2335,11 +2349,13 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { ResourceScheduler scheduler = resourceManager.getResourceScheduler(); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(6 * GB, 1)); + Resources.createResource(6 * GB, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -2383,17 +2399,19 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { public void testMoveAppQueueMetricsCheck() throws Exception { ResourceScheduler scheduler = resourceManager.getResourceScheduler(); + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * GB, 1)); + Resources.createResource(5 * GB, 1), mockNodeStatus); // Register node2 String host_1 = "host_1"; NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(5 * GB, 1)); + Resources.createResource(5 * GB, 1), mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); @@ -4594,9 +4612,12 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { } @Test public void testRemovedNodeDecomissioningNode() throws Exception { + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register nodemanager NodeManager nm = registerNode("host_decom", 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); RMNode node = resourceManager.getRMContext().getRMNodes().get(nm.getNodeId()); @@ -4639,10 +4660,14 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase { ((CapacityScheduler) resourceManager.getResourceScheduler()) .setRMContext(spyContext); ((AsyncDispatcher) mockDispatcher).start(); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index 2e043fb048..05ec09e667 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -50,6 +50,7 @@ import org.apache.hadoop.yarn.exceptions.SchedulerInvalidResoureRequestException import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.security.YarnAuthorizationProvider; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService; import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; @@ -124,6 +125,7 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.assertj.core.api.Assertions.assertThat; import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES; import static org.junit.Assert.assertEquals; @@ -4862,9 +4864,12 @@ public class TestFairScheduler extends FairSchedulerTestBase { @Test public void testRemovedNodeDecomissioningNode() throws Exception { + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register nodemanager NodeManager nm = registerNode("host_decom", 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); RMNode node = resourceManager.getRMContext().getRMNodes().get(nm.getNodeId()); @@ -4907,10 +4912,14 @@ public class TestFairScheduler extends FairSchedulerTestBase { ((FairScheduler) resourceManager.getResourceScheduler()) .setRMContext(spyContext); ((AsyncDispatcher) mockDispatcher).start(); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); RMNode node = resourceManager.getRMContext().getRMNodes().get(nm_0.getNodeId()); @@ -4949,11 +4958,13 @@ public class TestFairScheduler extends FairSchedulerTestBase { } private NodeManager registerNode(String hostName, int containerManagerPort, - int httpPort, String rackName, - Resource capability) + int httpPort, String rackName, + Resource capability, NodeStatus nodeStatus) throws IOException, YarnException { + NodeStatus mockNodeStatus = createMockNodeStatus(); + NodeManager nm = new NodeManager(hostName, containerManagerPort, httpPort, - rackName, capability, resourceManager); + rackName, capability, resourceManager, mockNodeStatus); // after YARN-5375, scheduler event is processed in rm main dispatcher, // wait it processed, or may lead dead lock diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java index 01fb6a79b4..9b3657e00d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -33,6 +34,7 @@ import java.util.List; import java.util.Map; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.event.Level; @@ -143,10 +145,10 @@ public class TestFifoScheduler { private NodeManager registerNode(String hostName, int containerManagerPort, int nmHttpPort, String rackName, - Resource capability) + Resource capability, NodeStatus nodeStatus) throws IOException, YarnException { NodeManager nm = new NodeManager(hostName, containerManagerPort, - nmHttpPort, rackName, capability, resourceManager); + nmHttpPort, rackName, capability, resourceManager, nodeStatus); NodeAddedSchedulerEvent nodeAddEvent1 = new NodeAddedSchedulerEvent(resourceManager.getRMContext().getRMNodes() .get(nm.getNodeId())); @@ -406,19 +408,21 @@ public class TestFifoScheduler { LOG.info("--- START: testFifoScheduler ---"); final int GB = 1024; - + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node1 String host_0 = "host_0"; org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(4 * GB, 1)); + Resources.createResource(4 * GB, 1), mockNodeStatus); nm_0.heartbeat(); // Register node2 String host_1 = "host_1"; org.apache.hadoop.yarn.server.resourcemanager.NodeManager nm_1 = registerNode(host_1, 1234, 2345, NetworkTopology.DEFAULT_RACK, - Resources.createResource(2 * GB, 1)); + Resources.createResource(2 * GB, 1), mockNodeStatus); nm_1.heartbeat(); // ResourceRequest priorities @@ -1197,9 +1201,12 @@ public class TestFifoScheduler { @Test public void testRemovedNodeDecomissioningNode() throws Exception { + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register nodemanager NodeManager nm = registerNode("host_decom", 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); RMNode node = resourceManager.getRMContext().getRMNodes().get(nm.getNodeId()); @@ -1242,10 +1249,14 @@ public class TestFifoScheduler { ((FifoScheduler) resourceManager.getResourceScheduler()) .setRMContext(spyContext); ((AsyncDispatcher) mockDispatcher).start(); + + NodeStatus mockNodeStatus = createMockNodeStatus(); + // Register node String host_0 = "host_0"; NodeManager nm_0 = registerNode(host_0, 1234, 2345, - NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4)); + NetworkTopology.DEFAULT_RACK, Resources.createResource(8 * GB, 4), + mockNodeStatus); // ResourceRequest priorities Priority priority_0 = Priority.newInstance(0); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java index c3f41f62f6..dc028fe743 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesNodes.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.webapp; +import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; import static org.assertj.core.api.Assertions.assertThat; import static org.apache.hadoop.yarn.webapp.WebServicesTestUtils.assertResponseStatusCode; import static org.junit.Assert.assertEquals; @@ -241,8 +242,10 @@ public class TestRMWebServicesNodes extends JerseyTestBase { } private void sendStartedEvent(RMNode node) { + NodeStatus mockNodeStatus = createMockNodeStatus(); ((RMNodeImpl) node) - .handle(new RMNodeStartedEvent(node.getNodeID(), null, null)); + .handle(new RMNodeStartedEvent(node.getNodeID(), null, null, + mockNodeStatus)); } private void sendLostEvent(RMNode node) {