diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index d3be720c49..2c9baf2305 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -52,14 +52,17 @@ public class GpuResourceHandlerImpl implements ResourceHandler { private final GpuResourceAllocator gpuAllocator; private final CGroupsHandler cGroupsHandler; private final PrivilegedOperationExecutor privilegedOperationExecutor; + private final GpuDiscoverer gpuDiscoverer; public GpuResourceHandlerImpl(Context nmContext, CGroupsHandler cGroupsHandler, - PrivilegedOperationExecutor privilegedOperationExecutor) { + PrivilegedOperationExecutor privilegedOperationExecutor, + GpuDiscoverer gpuDiscoverer) { this.nmContext = nmContext; this.cGroupsHandler = cGroupsHandler; this.privilegedOperationExecutor = privilegedOperationExecutor; - gpuAllocator = new GpuResourceAllocator(nmContext); + this.gpuAllocator = new GpuResourceAllocator(nmContext); + this.gpuDiscoverer = gpuDiscoverer; } @Override @@ -67,7 +70,7 @@ public List bootstrap(Configuration configuration) throws ResourceHandlerException { List usableGpus; try { - usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); + usableGpus = gpuDiscoverer.getGpusUsableByYarn(); if (usableGpus == null || usableGpus.isEmpty()) { String message = "GPU is enabled on the NodeManager, but couldn't find " + "any usable GPU devices, please double check configuration!"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index 9574980781..de061d6016 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -34,6 +34,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.slf4j.Logger; @@ -96,7 +98,10 @@ public synchronized void initialize(Context context) ResourcePlugin plugin = null; if (resourceName.equals(GPU_URI)) { - plugin = new GpuResourcePlugin(); + final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer(); + final GpuNodeResourceUpdateHandler updateHandler = + new GpuNodeResourceUpdateHandler(gpuDiscoverer); + plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer); } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 92792b7ba6..334a86c2c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -58,11 +58,6 @@ public class GpuDiscoverer { // command should not run more than 10 sec. private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_REPEATED_ERROR_ALLOWED = 10; - private static GpuDiscoverer instance; - - static { - instance = new GpuDiscoverer(); - } private Configuration conf = null; private String pathOfGpuBinary = null; @@ -293,8 +288,4 @@ Map getEnvironmentToRunCommand() { String getPathOfGpuBinary() { return pathOfGpuBinary; } - - public static GpuDiscoverer getInstance() { - return instance; - } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 8b19048c8b..4b2258d557 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -35,16 +35,20 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { private static final Logger LOG = LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); + private final GpuDiscoverer gpuDiscoverer; + + public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) { + this.gpuDiscoverer = gpuDiscoverer; + } @Override public void updateConfiguredResource(Resource res) throws YarnException { LOG.info("Initializing configured GPU resources for the NodeManager."); - List usableGpus = GpuDiscoverer.getInstance() - .getGpusUsableByYarn(); + List usableGpus = gpuDiscoverer.getGpusUsableByYarn(); if (usableGpus == null || usableGpus.isEmpty()) { String message = "GPU is enabled, " + - "but couldn't find any usable GPUs on the NodeManager!"; + "but could not find any usable GPUs on the NodeManager!"; LOG.error(message); // No gpu can be used by YARN. throw new YarnException(message); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index e49d2f24bd..393d76e1f5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,7 +18,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; @@ -34,18 +33,23 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import java.util.List; -import java.util.Map; public class GpuResourcePlugin implements ResourcePlugin { + private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; + private final GpuDiscoverer gpuDiscoverer; private GpuResourceHandlerImpl gpuResourceHandler = null; - private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; + public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler, + GpuDiscoverer gpuDiscoverer) { + this.resourceDiscoverHandler = resourceDiscoverHandler; + this.gpuDiscoverer = gpuDiscoverer; + } + @Override public synchronized void initialize(Context context) throws YarnException { - resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); - GpuDiscoverer.getInstance().initialize(context.getConf()); - dockerCommandPlugin = + this.gpuDiscoverer.initialize(context.getConf()); + this.dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); } @@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler( PrivilegedOperationExecutor privilegedOperationExecutor) { if (gpuResourceHandler == null) { gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler, - privilegedOperationExecutor); + privilegedOperationExecutor, gpuDiscoverer); } return gpuResourceHandler; @@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() { } @Override - public NMResourceInfo getNMResourceInfo() throws YarnException { + public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { GpuDeviceInformation gpuDeviceInformation = - GpuDiscoverer.getInstance().getGpuDeviceInformation(); + gpuDiscoverer.getGpuDeviceInformation(); GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index fff9068442..fb0df39ed2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -71,6 +71,7 @@ public class TestGpuResourceHandler { private GpuResourceHandlerImpl gpuResourceHandler; private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; + private GpuDiscoverer gpuDiscoverer; @Before public void setup() { @@ -89,8 +90,9 @@ public void setup() { runningContainersMap = new ConcurrentHashMap<>(); when(nmctx.getContainers()).thenReturn(runningContainersMap); + gpuDiscoverer = new GpuDiscoverer(); gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, - mockPrivilegedExecutor); + mockPrivilegedExecutor, gpuDiscoverer); } @Test @@ -98,7 +100,7 @@ public void testBootStrap() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); verify(mockCGroupsHandler, times(1)).initializeCGroupController( @@ -162,7 +164,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -251,7 +253,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -280,7 +282,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); try { gpuResourceHandler.bootstrap(conf); @@ -315,7 +317,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationStored() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -361,9 +363,9 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { GpuResourceHandlerImpl gpuNULLStateResourceHandler = new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, - mockPrivilegedExecutor); + mockPrivilegedExecutor, gpuDiscoverer); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuNULLStateResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -383,7 +385,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { public void testRecoverResourceAllocation() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4,