diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
index c39e52ede7..098e7c76da 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
@@ -20,8 +20,11 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.slf4j.Logger;
@@ -46,6 +49,10 @@ public class NodeResourceMonitorImpl extends AbstractService implements
/** Resource calculator. */
private ResourceCalculatorPlugin resourceCalculatorPlugin;
+ /**
+ * GPU resource plugin looked up in serviceInit under
+ * ResourceInformation.GPU_URI; stays null when no such plugin is found.
+ */
+ private GpuResourcePlugin gpuResourcePlugin;
+ /**
+ * Node resource handler obtained from the GPU plugin and used to read the
+ * node GPU utilization; stays null when the GPU plugin is absent.
+ */
+ private GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler;
+
/** Current resource utilization of the node. */
private ResourceUtilization nodeUtilization =
ResourceUtilization.newInstance(0, 0, 0f);
@@ -72,6 +79,18 @@ protected void serviceInit(Configuration conf) throws Exception {
this.resourceCalculatorPlugin =
ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf);
+ // Look up the GPU plugin and its node resource handler. Both fields are
+ // left null when there is no resource plugin manager or when no plugin
+ // is registered under GPU_URI, so later users must null-check them.
+ // NOTE(review): both casts assume the plugin registered under GPU_URI is
+ // a GpuResourcePlugin whose handler is a GpuNodeResourceUpdateHandler;
+ // confirm no other plugin type can be registered under that name.
+ if (nmContext.getResourcePluginManager() != null) {
+ this.gpuResourcePlugin =
+ (GpuResourcePlugin)nmContext.getResourcePluginManager().
+ getNameToPlugins().get(ResourceInformation.GPU_URI);
+
+ if (gpuResourcePlugin != null) {
+ this.gpuNodeResourceUpdateHandler =
+ (GpuNodeResourceUpdateHandler)gpuResourcePlugin.
+ getNodeResourceHandlerInstance();
+ }
+ }
+
LOG.info(" Using ResourceCalculatorPlugin : "
+ this.resourceCalculatorPlugin);
}
@@ -152,6 +171,14 @@ public void run() {
(int) (vmem >> 20), // B -> MB
vcores); // Used Virtual Cores
+        // The GPU handler is only set when a GPU resource plugin was found
+        // during serviceInit. On nodes without one, report 0 rather than
+        // dereferencing null and logging a NullPointerException each time
+        // this code runs.
+        float nodeGpuUtilization = 0F;
+        if (gpuNodeResourceUpdateHandler != null) {
+          try {
+            nodeGpuUtilization =
+                gpuNodeResourceUpdateHandler.getNodeGpuUtilization();
+          } catch (Exception e) {
+            // Best effort: keep monitoring and publish 0 for this sample.
+            // Pass the exception itself so the stack trace is not lost.
+            LOG.error("Get Node GPU Utilization error: ", e);
+          }
+        }
+
// Publish the node utilization metrics to node manager
// metrics system.
NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics();
@@ -159,6 +186,7 @@ public void run() {
nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory());
nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory());
nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU());
+ nmMetrics.setNodeGpuUtilization(nodeGpuUtilization);
}
try {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index afb0d7eda2..af81709566 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -26,12 +26,14 @@
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
+import java.util.stream.Collectors;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
@@ -76,4 +78,20 @@ public void updateConfiguredResource(Resource res) throws YarnException {
res.setResourceValue(GPU_URI, nUsableGpus);
}
+
+  /**
+   * Computes the node-level GPU utilization as the arithmetic mean of the
+   * overall utilization reported for each GPU by the discoverer.
+   *
+   * @return the mean overall GPU utilization, or 0 when the discoverer
+   *         reports no GPU list or an empty one
+   * @throws Exception if the GPU device information cannot be obtained
+   *         from the discoverer
+   */
+  public float getNodeGpuUtilization() throws Exception{
+    // The element type is required: with a raw List the lambda parameter
+    // would be an Object and the accessor calls below would not compile.
+    List<PerGpuDeviceInformation> gpuList =
+        gpuDiscoverer.getGpuDeviceInformation().getGpus();
+    if (gpuList == null) {
+      return 0F;
+    }
+    // average() is empty for an empty list, which maps to 0.
+    return (float) gpuList.stream()
+        .mapToDouble(g -> g.getGpuUtilizations().getOverallGpuUtilization())
+        .average()
+        .orElse(0D);
+  }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
index abe4529816..848b944528 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
@@ -98,6 +98,8 @@ public class NodeManagerMetrics {
MutableGaugeInt nodeUsedVMemGB;
@Metric("Current CPU utilization")
MutableGaugeFloat nodeCpuUtilization;
+ @Metric("Current GPU utilization")
+ MutableGaugeFloat nodeGpuUtilization;
@Metric("Missed localization requests in bytes")
MutableCounterLong localizedCacheMissBytes;
@@ -428,6 +430,14 @@ public void setNodeCpuUtilization(float cpuUtilization) {
this.nodeCpuUtilization.set(cpuUtilization);
}
+ /**
+ * Stores the given value in the "Current GPU utilization" gauge.
+ *
+ * @param nodeGpuUtilization the node GPU utilization to publish
+ */
+ public void setNodeGpuUtilization(float nodeGpuUtilization) {
+ this.nodeGpuUtilization.set(nodeGpuUtilization);
+ }
+
+  /**
+   * Reads back the "Current GPU utilization" gauge.
+   *
+   * @return the value currently held by the gauge
+   */
+  public float getNodeGpuUtilization() {
+    return this.nodeGpuUtilization.value();
+  }
+
private void updateLocalizationHitRatios() {
updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes,
localizedCacheHitBytesRatio);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 826cc02219..c67ae86f95 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -437,14 +437,16 @@ public void testNodeManagerMetricsRecovery() throws Exception {
waitForNMContainerState(cm, cid,
org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.ContainerState.RUNNING);
- TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7);
+ TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
+ 1, 1, 1, 9, 1, 7, 0F);
// restart and verify metrics could be recovered
cm.stop();
DefaultMetricsSystem.shutdown();
metrics = NodeManagerMetrics.create();
metrics.addResource(Resource.newInstance(10240, 8));
- TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 8);
+ TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
+ 0, 0, 10, 0, 8, 0F);
context = createContext(conf, stateStore);
cm = createContainerManager(context, delSrvc);
cm.init(conf);
@@ -452,7 +454,8 @@ public void testNodeManagerMetricsRecovery() throws Exception {
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
- TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7);
+ TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
+ 1, 1, 1, 9, 1, 7, 0F);
cm.stop();
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java
index 10e6f5afa9..749e0cc14d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java
@@ -21,11 +21,13 @@
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.thirdparty.com.google.common.collect.Lists;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuUtilizations;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
@@ -122,4 +124,45 @@ public void testGetNMResourceInfoAutoDiscoveryDisabled()
(NMGpuResourceInfo) target.getNMResourceInfo();
Assert.assertNull(resourceInfo.getGpuDeviceInformation());
}
+
+  /**
+   * Verifies that the handler reports the mean of the per-GPU overall
+   * utilizations returned by the mocked discoverer (0.4 and 0.6).
+   */
+  @Test
+  public void testNodeGPUUtilization()
+      throws Exception {
+    GpuNodeResourceUpdateHandler handler = new GpuNodeResourceUpdateHandler(
+        createNodeGPUUtilizationDiscoverer(), new Configuration());
+
+    float utilization = handler.getNodeGpuUtilization();
+
+    Assert.assertEquals(0.5F, utilization, 1e-6);
+  }
+
+  /**
+   * Builds a mocked {@link GpuDiscoverer} whose device information lists
+   * two GPUs with overall utilizations of 0.4 and 0.6.
+   *
+   * @return the mocked discoverer
+   * @throws YarnException declared by the stubbed discoverer call
+   */
+  private GpuDiscoverer createNodeGPUUtilizationDiscoverer()
+      throws YarnException {
+    // Typed list: a raw List would make setGpus an unchecked call.
+    List<PerGpuDeviceInformation> gpus = Lists.newArrayList();
+    gpus.add(createGpuWithUtilization(0.4F));
+    gpus.add(createGpuWithUtilization(0.6F));
+
+    GpuDeviceInformation gpuDeviceInfo = new GpuDeviceInformation();
+    gpuDeviceInfo.setGpus(gpus);
+
+    GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
+    when(gpuDiscoverer.getGpuDeviceInformation()).thenReturn(gpuDeviceInfo);
+    return gpuDiscoverer;
+  }
+
+  /** Creates a GPU entry with only its overall utilization populated. */
+  private static PerGpuDeviceInformation createGpuWithUtilization(
+      float overallUtilization) {
+    PerGpuUtilizations utilizations = new PerGpuUtilizations();
+    utilizations.setOverallGpuUtilization(overallUtilization);
+
+    PerGpuDeviceInformation gpu = new PerGpuDeviceInformation();
+    gpu.setGpuUtilizations(utilizations);
+    return gpu;
+  }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java
index c5f80ba958..37454747c9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/metrics/TestNodeManagerMetrics.java
@@ -100,11 +100,15 @@ public void testReferenceOfSingletonJvmMetrics() {
metrics.addContainerLaunchDuration(1);
Assert.assertTrue(metrics.containerLaunchDuration.changed());
+ // Set node gpu utilization
+ metrics.setNodeGpuUtilization(35.5F);
+
// availableGB is expected to be floored,
// while allocatedGB is expected to be ceiled.
// allocatedGB: 3.75GB allocated memory is shown as 4GB
// availableGB: 4.25GB available memory is shown as 4GB
- checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3);
+ checkMetrics(10, 1, 1, 1, 1,
+ 1, 4, 7, 4, 13, 3, 35.5F);
// Update resource and check available resource again
metrics.addResource(total);
@@ -116,7 +120,7 @@ public void testReferenceOfSingletonJvmMetrics() {
public static void checkMetrics(int launched, int completed, int failed,
int killed, int initing, int running, int allocatedGB,
int allocatedContainers, int availableGB, int allocatedVCores,
- int availableVCores) {
+ int availableVCores, Float nodeGpuUtilization) {
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
assertCounter("ContainersLaunched", launched, rb);
assertCounter("ContainersCompleted", completed, rb);
@@ -129,6 +133,7 @@ public static void checkMetrics(int launched, int completed, int failed,
assertGauge("AllocatedContainers", allocatedContainers, rb);
assertGauge("AvailableGB", availableGB, rb);
assertGauge("AvailableVCores",availableVCores, rb);
+ assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
}
}