YARN-9121. Replace GpuDiscoverer.getInstance() with a readable object for easy access control. Contributed by Szilard Nemeth.

This commit is contained in:
Sunil G 2019-02-25 11:30:46 +05:30
parent 92b1fdcece
commit 5e91ebd91a
6 changed files with 43 additions and 34 deletions

View File

@ -52,14 +52,17 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
private final GpuResourceAllocator gpuAllocator;
private final CGroupsHandler cGroupsHandler;
private final PrivilegedOperationExecutor privilegedOperationExecutor;
private final GpuDiscoverer gpuDiscoverer;
public GpuResourceHandlerImpl(Context nmContext,
CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor) {
PrivilegedOperationExecutor privilegedOperationExecutor,
GpuDiscoverer gpuDiscoverer) {
this.nmContext = nmContext;
this.cGroupsHandler = cGroupsHandler;
this.privilegedOperationExecutor = privilegedOperationExecutor;
gpuAllocator = new GpuResourceAllocator(nmContext);
this.gpuAllocator = new GpuResourceAllocator(nmContext);
this.gpuDiscoverer = gpuDiscoverer;
}
@Override
@ -67,7 +70,7 @@ public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException {
List<GpuDevice> usableGpus;
try {
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn();
usableGpus = gpuDiscoverer.getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled on the NodeManager, but couldn't find "
+ "any usable GPU devices, please double check configuration!";

View File

@ -34,6 +34,8 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger;
@ -96,7 +98,10 @@ public synchronized void initialize(Context context)
ResourcePlugin plugin = null;
if (resourceName.equals(GPU_URI)) {
plugin = new GpuResourcePlugin();
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
final GpuNodeResourceUpdateHandler updateHandler =
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
} else if (resourceName.equals(FPGA_URI)) {
plugin = new FpgaResourcePlugin();
}

View File

@ -58,11 +58,6 @@ public class GpuDiscoverer {
// The command should not run for more than 10 seconds.
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private static GpuDiscoverer instance;
static {
instance = new GpuDiscoverer();
}
private Configuration conf = null;
private String pathOfGpuBinary = null;
@ -293,8 +288,4 @@ Map<String, String> getEnvironmentToRunCommand() {
/**
 * Returns the GPU binary path currently held by this discoverer.
 *
 * <p>The backing field is declared with a {@code null} initial value; where
 * it is assigned is not visible in this excerpt, so callers should be
 * prepared for {@code null}.
 *
 * @return the current value of the {@code pathOfGpuBinary} field
 */
String getPathOfGpuBinary() {
return pathOfGpuBinary;
}
public static GpuDiscoverer getInstance() {
return instance;
}
}

View File

@ -35,16 +35,20 @@
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
private static final Logger LOG =
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
private final GpuDiscoverer gpuDiscoverer;
/**
 * Creates an update handler that works against the supplied discoverer
 * instance rather than a statically obtained one.
 *
 * @param gpuDiscoverer discoverer to store in this handler; it is kept
 *     as-is (no null check is performed here)
 */
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
this.gpuDiscoverer = gpuDiscoverer;
}
@Override
public void updateConfiguredResource(Resource res) throws YarnException {
LOG.info("Initializing configured GPU resources for the NodeManager.");
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance()
.getGpusUsableByYarn();
List<GpuDevice> usableGpus = gpuDiscoverer.getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled, " +
"but couldn't find any usable GPUs on the NodeManager!";
"but could not find any usable GPUs on the NodeManager!";
LOG.error(message);
// No gpu can be used by YARN.
throw new YarnException(message);

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
@ -34,18 +33,23 @@
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import java.util.List;
import java.util.Map;
public class GpuResourcePlugin implements ResourcePlugin {
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer;
private GpuResourceHandlerImpl gpuResourceHandler = null;
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
private DockerCommandPlugin dockerCommandPlugin = null;
/**
 * Creates the GPU resource plugin from its two collaborators. Both
 * arguments are stored as-is; no validation is performed here.
 *
 * @param resourceDiscoverHandler node resource update handler kept by this
 *     plugin
 * @param gpuDiscoverer discoverer instance kept by this plugin
 */
public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler,
GpuDiscoverer gpuDiscoverer) {
this.resourceDiscoverHandler = resourceDiscoverHandler;
this.gpuDiscoverer = gpuDiscoverer;
}
@Override
public synchronized void initialize(Context context) throws YarnException {
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
GpuDiscoverer.getInstance().initialize(context.getConf());
dockerCommandPlugin =
this.gpuDiscoverer.initialize(context.getConf());
this.dockerCommandPlugin =
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
context.getConf());
}
@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler(
PrivilegedOperationExecutor privilegedOperationExecutor) {
if (gpuResourceHandler == null) {
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
privilegedOperationExecutor);
privilegedOperationExecutor, gpuDiscoverer);
}
return gpuResourceHandler;
@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
}
@Override
public NMResourceInfo getNMResourceInfo() throws YarnException {
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation =
GpuDiscoverer.getInstance().getGpuDeviceInformation();
gpuDiscoverer.getGpuDeviceInformation();
GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();

View File

@ -71,6 +71,7 @@ public class TestGpuResourceHandler {
private GpuResourceHandlerImpl gpuResourceHandler;
private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
private GpuDiscoverer gpuDiscoverer;
@Before
public void setup() {
@ -89,8 +90,9 @@ public void setup() {
runningContainersMap = new ConcurrentHashMap<>();
when(nmctx.getContainers()).thenReturn(runningContainersMap);
gpuDiscoverer = new GpuDiscoverer();
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
mockPrivilegedExecutor);
mockPrivilegedExecutor, gpuDiscoverer);
}
@Test
@ -98,7 +100,7 @@ public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf);
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
@ -162,7 +164,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled)
throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
@ -251,7 +253,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
@ -280,7 +282,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
try {
gpuResourceHandler.bootstrap(conf);
@ -315,7 +317,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception {
public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
@ -361,9 +363,9 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
GpuResourceHandlerImpl gpuNULLStateResourceHandler =
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
mockPrivilegedExecutor);
mockPrivilegedExecutor, gpuDiscoverer);
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuNULLStateResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
@ -383,7 +385,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,