diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java index c39e52ede7..098e7c76da 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java @@ -20,8 +20,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.api.records.ResourceUtilization; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin; import org.slf4j.Logger; @@ -46,6 +49,10 @@ public class NodeResourceMonitorImpl extends AbstractService implements /** Resource calculator. */ private ResourceCalculatorPlugin resourceCalculatorPlugin; + /** Gpu related plugin. */ + private GpuResourcePlugin gpuResourcePlugin; + private GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler; + /** Current resource utilization of the node. */ private ResourceUtilization nodeUtilization = ResourceUtilization.newInstance(0, 0, 0f); @@ -72,6 +79,18 @@ protected void serviceInit(Configuration conf) throws Exception { this.resourceCalculatorPlugin = ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf); + if (nmContext.getResourcePluginManager() != null) { + this.gpuResourcePlugin = + (GpuResourcePlugin)nmContext.getResourcePluginManager(). + getNameToPlugins().get(ResourceInformation.GPU_URI); + + if (gpuResourcePlugin != null) { + this.gpuNodeResourceUpdateHandler = + (GpuNodeResourceUpdateHandler)gpuResourcePlugin. + getNodeResourceHandlerInstance(); + } + } + LOG.info(" Using ResourceCalculatorPlugin : " + this.resourceCalculatorPlugin); } @@ -152,6 +171,14 @@ public void run() { (int) (vmem >> 20), // B -> MB vcores); // Used Virtual Cores + float nodeGpuUtilization = 0F; + try { + nodeGpuUtilization = + gpuNodeResourceUpdateHandler.getNodeGpuUtilization(); + } catch (Exception e) { + LOG.error("Get Node GPU Utilization error: " + e); + } + // Publish the node utilization metrics to node manager // metrics system. NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics(); @@ -159,6 +186,7 @@ public void run() { nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory()); nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory()); nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU()); + nmMetrics.setNodeGpuUtilization(nodeGpuUtilization); } try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index afb0d7eda2..af81709566 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -26,12 +26,14 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; @@ -76,4 +78,20 @@ public void updateConfiguredResource(Resource res) throws YarnException { res.setResourceValue(GPU_URI, nUsableGpus); } + + public float getNodeGpuUtilization() throws Exception{ + List gpuList = + gpuDiscoverer.getGpuDeviceInformation().getGpus(); + Float totalGpuUtilization = 0F; + if (gpuList != null && + gpuList.size() != 0) { + + totalGpuUtilization = gpuList + .stream() + .map(g -> g.getGpuUtilizations().getOverallGpuUtilization()) + .collect(Collectors.summingDouble(Float::floatValue)) + .floatValue() / gpuList.size(); + } + return totalGpuUtilization; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index abe4529816..848b944528 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -98,6 +98,8 @@ public class NodeManagerMetrics { MutableGaugeInt nodeUsedVMemGB; @Metric("Current CPU utilization") MutableGaugeFloat nodeCpuUtilization; + @Metric("Current GPU utilization") + MutableGaugeFloat nodeGpuUtilization; @Metric("Missed localization requests in bytes") MutableCounterLong localizedCacheMissBytes; @@ -428,6 +430,14 @@ public void setNodeCpuUtilization(float cpuUtilization) { this.nodeCpuUtilization.set(cpuUtilization); } + public void setNodeGpuUtilization(float nodeGpuUtilization) { + this.nodeGpuUtilization.set(nodeGpuUtilization); + } + + public float getNodeGpuUtilization() { + return nodeGpuUtilization.value(); + } + private void updateLocalizationHitRatios() { updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes, localizedCacheHitBytesRatio); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index 826cc02219..c67ae86f95 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -437,14 +437,16 @@ public void testNodeManagerMetricsRecovery() throws Exception { waitForNMContainerState(cm, cid, org.apache.hadoop.yarn.server.nodemanager .containermanager.container.ContainerState.RUNNING); - TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7); + TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, + 1, 1, 1, 9, 1, 7, 0F); // restart and verify metrics could be recovered cm.stop(); DefaultMetricsSystem.shutdown(); metrics = NodeManagerMetrics.create(); metrics.addResource(Resource.newInstance(10240, 8)); - TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 8); + TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, + 0, 0, 10, 0, 8, 0F); context = createContext(conf, stateStore); cm = createContainerManager(context, delSrvc); cm.init(conf); @@ -452,7 +454,8 @@ public void testNodeManagerMetricsRecovery() throws Exception { assertEquals(1, context.getApplications().size()); app = context.getApplications().get(appId); assertNotNull(app); - TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7); + TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, + 1, 1, 1, 9, 1, 7, 0F); cm.stop(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java index 10e6f5afa9..749e0cc14d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java @@ -21,11 +21,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.thirdparty.com.google.common.collect.Lists; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuUtilizations; import org.junit.Assert; import org.junit.Test; import java.util.List; @@ -122,4 +124,45 @@ public void testGetNMResourceInfoAutoDiscoveryDisabled() (NMGpuResourceInfo) target.getNMResourceInfo(); Assert.assertNull(resourceInfo.getGpuDeviceInformation()); } + + @Test + public void testNodeGPUUtilization() + throws Exception { + GpuDiscoverer gpuDiscoverer = createNodeGPUUtilizationDiscoverer(); + + GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler = + new GpuNodeResourceUpdateHandler(gpuDiscoverer, new Configuration()); + + Assert.assertEquals(0.5F, + gpuNodeResourceUpdateHandler.getNodeGpuUtilization(), 1e-6); + } + + private GpuDiscoverer createNodeGPUUtilizationDiscoverer() + throws YarnException { + GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class); + + PerGpuDeviceInformation gpu1 = + new PerGpuDeviceInformation(); + PerGpuUtilizations perGpuUtilizations1 = + new PerGpuUtilizations(); + perGpuUtilizations1.setOverallGpuUtilization(0.4F); + + gpu1.setGpuUtilizations(perGpuUtilizations1); + + PerGpuDeviceInformation gpu2 = + new PerGpuDeviceInformation(); + PerGpuUtilizations perGpuUtilizations2 = + new PerGpuUtilizations(); + perGpuUtilizations2.setOverallGpuUtilization(0.6F); + gpu2.setGpuUtilizations(perGpuUtilizations2); + + List gpus = Lists.newArrayList(); + gpus.add(gpu1); + gpus.add(gpu2); + + GpuDeviceInformation gpuDeviceInfo = new GpuDeviceInformation(); + gpuDeviceInfo.setGpus(gpus); + when(gpuDiscoverer.getGpuDeviceInformation()).thenReturn(gpuDeviceInfo); + return gpuDiscoverer; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java index c5f80ba958..37454747c9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java @@ -100,11 +100,15 @@ public void testReferenceOfSingletonJvmMetrics() { metrics.addContainerLaunchDuration(1); Assert.assertTrue(metrics.containerLaunchDuration.changed()); + // Set node gpu utilization + metrics.setNodeGpuUtilization(35.5F); + // availableGB is expected to be floored, // while allocatedGB is expected to be ceiled. // allocatedGB: 3.75GB allocated memory is shown as 4GB // availableGB: 4.25GB available memory is shown as 4GB - checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3); + checkMetrics(10, 1, 1, 1, 1, + 1, 4, 7, 4, 13, 3, 35.5F); // Update resource and check available resource again metrics.addResource(total); @@ -116,7 +120,7 @@ public void testReferenceOfSingletonJvmMetrics() { public static void checkMetrics(int launched, int completed, int failed, int killed, int initing, int running, int allocatedGB, int allocatedContainers, int availableGB, int allocatedVCores, - int availableVCores) { + int availableVCores, Float nodeGpuUtilization) { MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics"); assertCounter("ContainersLaunched", launched, rb); assertCounter("ContainersCompleted", completed, rb); @@ -129,6 +133,7 @@ public static void checkMetrics(int launched, int completed, int failed, assertGauge("AllocatedContainers", allocatedContainers, rb); assertGauge("AvailableGB", availableGB, rb); assertGauge("AvailableVCores",availableVCores, rb); + assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb); } }