YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu.

This commit is contained in:
Eric Badger 2021-03-17 18:11:37 +00:00
parent b503de2328
commit 49f89f1d3d
2 changed files with 89 additions and 2 deletions

View File

@ -34,6 +34,8 @@
import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
@InterfaceAudience.Private @InterfaceAudience.Private
@Metrics(context="yarn") @Metrics(context="yarn")
@ -56,13 +58,14 @@ public class ClusterMetrics {
@Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores; @Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores;
@Metric("Memory Capability") MutableGaugeLong capabilityMB; @Metric("Memory Capability") MutableGaugeLong capabilityMB;
@Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores; @Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
@Metric("GPU Capability") MutableGaugeLong capabilityGPUs;
private static final MetricsInfo RECORD_INFO = info("ClusterMetrics", private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
"Metrics for the Yarn Cluster"); "Metrics for the Yarn Cluster");
private static volatile ClusterMetrics INSTANCE = null; private static volatile ClusterMetrics INSTANCE = null;
private static MetricsRegistry registry; private static MetricsRegistry registry;
public static ClusterMetrics getMetrics() { public static ClusterMetrics getMetrics() {
if(!isInitialized.get()){ if(!isInitialized.get()){
synchronized (ClusterMetrics.class) { synchronized (ClusterMetrics.class) {
@ -206,10 +209,24 @@ public long getCapabilityVirtualCores() {
return capabilityVirtualCores.value(); return capabilityVirtualCores.value();
} }
public long getCapabilityGPUs() {
if (capabilityGPUs == null) {
return 0;
}
return capabilityGPUs.value();
}
public void incrCapability(Resource res) { public void incrCapability(Resource res) {
if (res != null) { if (res != null) {
capabilityMB.incr(res.getMemorySize()); capabilityMB.incr(res.getMemorySize());
capabilityVirtualCores.incr(res.getVirtualCores()); capabilityVirtualCores.incr(res.getVirtualCores());
Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
.get(ResourceInformation.GPU_URI);
if (gpuIndex != null) {
capabilityGPUs.incr(res.
getResourceValue(ResourceInformation.GPU_URI));
}
} }
} }
@ -217,6 +234,12 @@ public void decrCapability(Resource res) {
if (res != null) { if (res != null) {
capabilityMB.decr(res.getMemorySize()); capabilityMB.decr(res.getMemorySize());
capabilityVirtualCores.decr(res.getVirtualCores()); capabilityVirtualCores.decr(res.getVirtualCores());
Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
.get(ResourceInformation.GPU_URI);
if (gpuIndex != null) {
capabilityGPUs.decr(res.
getResourceValue(ResourceInformation.GPU_URI));
}
} }
} }
@ -251,4 +274,4 @@ public long getUtilizedVirtualCores() {
public void incrUtilizedVirtualCores(long delta) { public void incrUtilizedVirtualCores(long delta) {
utilizedVirtualCores.incr(delta); utilizedVirtualCores.incr(delta);
} }
} }

View File

@ -22,18 +22,22 @@
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData; import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData;
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter; import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter;
import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles; import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator; import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
@ -47,8 +51,12 @@
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB;
import static org.junit.Assert.assertEquals;
/** /**
* Test case for custom resource container allocation. * Test case for custom resource container allocation.
@ -64,6 +72,9 @@ public class TestCSAllocateCustomResource {
private final int g = 1024; private final int g = 1024;
private ClusterNodeTracker<FiCaSchedulerNode> nodeTracker;
private ClusterMetrics metrics;
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
conf = new YarnConfiguration(); conf = new YarnConfiguration();
@ -182,4 +193,57 @@ public void testCapacitySchedulerInitWithCustomResourceType()
.getResourceValue("yarn.io/gpu")); .getResourceValue("yarn.io/gpu"));
rm.close(); rm.close();
} }
@Test
public void testClusterMetricsWithGPU()
throws Exception {
metrics = ClusterMetrics.getMetrics();
// reset resource types
ResourceUtils.resetResourceTypes();
String resourceTypesFileName = "resource-types-test.xml";
File source = new File(
conf.getClassLoader().getResource(resourceTypesFileName).getFile());
resourceTypesFile = new File(source.getParent(), "resource-types.xml");
FileUtils.copyFile(source, resourceTypesFile);
CapacitySchedulerConfiguration newConf =
(CapacitySchedulerConfiguration) TestUtils
.getConfigurationWithMultipleQueues(conf);
newConf.setClass(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
DominantResourceCalculator.class, ResourceCalculator.class);
//start RM
MockRM rm = new MockRM(newConf);
rm.start();
nodeTracker = new ClusterNodeTracker<>();
MockNodes.resetHostIds();
Resource nodeResource = Resource.newInstance(4096, 4,
Collections.singletonMap(GPU_URI, 4L));
List<RMNode> rmNodes =
MockNodes.newNodes(2, 4, nodeResource);
for (RMNode rmNode : rmNodes) {
nodeTracker.addNode(new FiCaSchedulerNode(rmNode, false));
}
// Check GPU inc related cluster metrics.
assertEquals("Cluster Capability Memory incorrect",
metrics.getCapabilityMB(), (4096 * 8));
assertEquals("Cluster Capability Vcores incorrect",
metrics.getCapabilityVirtualCores(), 4 * 8);
assertEquals("Cluster Capability GPUs incorrect",
metrics.getCapabilityGPUs(), 4 * 8);
for (RMNode rmNode : rmNodes) {
nodeTracker.removeNode(rmNode.getNodeID());
}
// Check GPU dec related cluster metrics.
assertEquals("Cluster Capability Memory incorrect",
metrics.getCapabilityMB(), 0);
assertEquals("Cluster Capability Vcores incorrect",
metrics.getCapabilityVirtualCores(), 0);
assertEquals("Cluster Capability GPUs incorrect",
metrics.getCapabilityGPUs(), 0);
ClusterMetrics.destroy();
}
} }