YARN-9121. Replace GpuDiscoverer.getInstance() with a readable object for easy access control. Contributed by Szilard Nemeth.

This commit is contained in:
Sunil G 2019-02-25 11:30:46 +05:30
parent 92b1fdcece
commit 5e91ebd91a
6 changed files with 43 additions and 34 deletions

View File

@ -52,14 +52,17 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
private final GpuResourceAllocator gpuAllocator; private final GpuResourceAllocator gpuAllocator;
private final CGroupsHandler cGroupsHandler; private final CGroupsHandler cGroupsHandler;
private final PrivilegedOperationExecutor privilegedOperationExecutor; private final PrivilegedOperationExecutor privilegedOperationExecutor;
private final GpuDiscoverer gpuDiscoverer;
public GpuResourceHandlerImpl(Context nmContext, public GpuResourceHandlerImpl(Context nmContext,
CGroupsHandler cGroupsHandler, CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor) { PrivilegedOperationExecutor privilegedOperationExecutor,
GpuDiscoverer gpuDiscoverer) {
this.nmContext = nmContext; this.nmContext = nmContext;
this.cGroupsHandler = cGroupsHandler; this.cGroupsHandler = cGroupsHandler;
this.privilegedOperationExecutor = privilegedOperationExecutor; this.privilegedOperationExecutor = privilegedOperationExecutor;
gpuAllocator = new GpuResourceAllocator(nmContext); this.gpuAllocator = new GpuResourceAllocator(nmContext);
this.gpuDiscoverer = gpuDiscoverer;
} }
@Override @Override
@ -67,7 +70,7 @@ public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException { throws ResourceHandlerException {
List<GpuDevice> usableGpus; List<GpuDevice> usableGpus;
try { try {
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); usableGpus = gpuDiscoverer.getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) { if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled on the NodeManager, but couldn't find " String message = "GPU is enabled on the NodeManager, but couldn't find "
+ "any usable GPU devices, please double check configuration!"; + "any usable GPU devices, please double check configuration!";

View File

@ -34,6 +34,8 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -96,7 +98,10 @@ public synchronized void initialize(Context context)
ResourcePlugin plugin = null; ResourcePlugin plugin = null;
if (resourceName.equals(GPU_URI)) { if (resourceName.equals(GPU_URI)) {
plugin = new GpuResourcePlugin(); final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
final GpuNodeResourceUpdateHandler updateHandler =
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
} else if (resourceName.equals(FPGA_URI)) { } else if (resourceName.equals(FPGA_URI)) {
plugin = new FpgaResourcePlugin(); plugin = new FpgaResourcePlugin();
} }

View File

@ -58,11 +58,6 @@ public class GpuDiscoverer {
// command should not run more than 10 sec. // command should not run more than 10 sec.
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
private static final int MAX_REPEATED_ERROR_ALLOWED = 10; private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private static GpuDiscoverer instance;
static {
instance = new GpuDiscoverer();
}
private Configuration conf = null; private Configuration conf = null;
private String pathOfGpuBinary = null; private String pathOfGpuBinary = null;
@ -293,8 +288,4 @@ Map<String, String> getEnvironmentToRunCommand() {
String getPathOfGpuBinary() { String getPathOfGpuBinary() {
return pathOfGpuBinary; return pathOfGpuBinary;
} }
public static GpuDiscoverer getInstance() {
return instance;
}
} }

View File

@ -35,16 +35,20 @@
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
private static final Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
private final GpuDiscoverer gpuDiscoverer;
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
this.gpuDiscoverer = gpuDiscoverer;
}
@Override @Override
public void updateConfiguredResource(Resource res) throws YarnException { public void updateConfiguredResource(Resource res) throws YarnException {
LOG.info("Initializing configured GPU resources for the NodeManager."); LOG.info("Initializing configured GPU resources for the NodeManager.");
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance() List<GpuDevice> usableGpus = gpuDiscoverer.getGpusUsableByYarn();
.getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) { if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled, " + String message = "GPU is enabled, " +
"but couldn't find any usable GPUs on the NodeManager!"; "but could not find any usable GPUs on the NodeManager!";
LOG.error(message); LOG.error(message);
// No gpu can be used by YARN. // No gpu can be used by YARN.
throw new YarnException(message); throw new YarnException(message);

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
@ -34,18 +33,23 @@
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import java.util.List; import java.util.List;
import java.util.Map;
public class GpuResourcePlugin implements ResourcePlugin { public class GpuResourcePlugin implements ResourcePlugin {
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer;
private GpuResourceHandlerImpl gpuResourceHandler = null; private GpuResourceHandlerImpl gpuResourceHandler = null;
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
private DockerCommandPlugin dockerCommandPlugin = null; private DockerCommandPlugin dockerCommandPlugin = null;
public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler,
GpuDiscoverer gpuDiscoverer) {
this.resourceDiscoverHandler = resourceDiscoverHandler;
this.gpuDiscoverer = gpuDiscoverer;
}
@Override @Override
public synchronized void initialize(Context context) throws YarnException { public synchronized void initialize(Context context) throws YarnException {
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); this.gpuDiscoverer.initialize(context.getConf());
GpuDiscoverer.getInstance().initialize(context.getConf()); this.dockerCommandPlugin =
dockerCommandPlugin =
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
context.getConf()); context.getConf());
} }
@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler(
PrivilegedOperationExecutor privilegedOperationExecutor) { PrivilegedOperationExecutor privilegedOperationExecutor) {
if (gpuResourceHandler == null) { if (gpuResourceHandler == null) {
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler, gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
privilegedOperationExecutor); privilegedOperationExecutor, gpuDiscoverer);
} }
return gpuResourceHandler; return gpuResourceHandler;
@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
} }
@Override @Override
public NMResourceInfo getNMResourceInfo() throws YarnException { public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation = GpuDeviceInformation gpuDeviceInformation =
GpuDiscoverer.getInstance().getGpuDeviceInformation(); gpuDiscoverer.getGpuDeviceInformation();
GpuResourceAllocator gpuResourceAllocator = GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator(); gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();

View File

@ -71,6 +71,7 @@ public class TestGpuResourceHandler {
private GpuResourceHandlerImpl gpuResourceHandler; private GpuResourceHandlerImpl gpuResourceHandler;
private NMStateStoreService mockNMStateStore; private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap; private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
private GpuDiscoverer gpuDiscoverer;
@Before @Before
public void setup() { public void setup() {
@ -89,8 +90,9 @@ public void setup() {
runningContainersMap = new ConcurrentHashMap<>(); runningContainersMap = new ConcurrentHashMap<>();
when(nmctx.getContainers()).thenReturn(runningContainersMap); when(nmctx.getContainers()).thenReturn(runningContainersMap);
gpuDiscoverer = new GpuDiscoverer();
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
mockPrivilegedExecutor); mockPrivilegedExecutor, gpuDiscoverer);
} }
@Test @Test
@ -98,7 +100,7 @@ public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
verify(mockCGroupsHandler, times(1)).initializeCGroupController( verify(mockCGroupsHandler, times(1)).initializeCGroupController(
@ -162,7 +164,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled)
throws Exception { throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4, Assert.assertEquals(4,
@ -251,7 +253,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception { throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4, Assert.assertEquals(4,
@ -280,7 +282,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
try { try {
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
@ -315,7 +317,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception {
public void testAllocationStored() throws Exception { public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4, Assert.assertEquals(4,
@ -361,9 +363,9 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
GpuResourceHandlerImpl gpuNULLStateResourceHandler = GpuResourceHandlerImpl gpuNULLStateResourceHandler =
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
mockPrivilegedExecutor); mockPrivilegedExecutor, gpuDiscoverer);
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuNULLStateResourceHandler.bootstrap(conf); gpuNULLStateResourceHandler.bootstrap(conf);
Assert.assertEquals(4, Assert.assertEquals(4,
@ -383,7 +385,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
public void testRecoverResourceAllocation() throws Exception { public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf); gpuDiscoverer.initialize(conf);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4, Assert.assertEquals(4,