YARN-9121. Replace GpuDiscoverer.getInstance() with a readable object for easy access control. Contributed by Szilard Nemeth.
This commit is contained in:
parent
92b1fdcece
commit
5e91ebd91a
@ -52,14 +52,17 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||
private final GpuResourceAllocator gpuAllocator;
|
||||
private final CGroupsHandler cGroupsHandler;
|
||||
private final PrivilegedOperationExecutor privilegedOperationExecutor;
|
||||
private final GpuDiscoverer gpuDiscoverer;
|
||||
|
||||
public GpuResourceHandlerImpl(Context nmContext,
|
||||
CGroupsHandler cGroupsHandler,
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor,
|
||||
GpuDiscoverer gpuDiscoverer) {
|
||||
this.nmContext = nmContext;
|
||||
this.cGroupsHandler = cGroupsHandler;
|
||||
this.privilegedOperationExecutor = privilegedOperationExecutor;
|
||||
gpuAllocator = new GpuResourceAllocator(nmContext);
|
||||
this.gpuAllocator = new GpuResourceAllocator(nmContext);
|
||||
this.gpuDiscoverer = gpuDiscoverer;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -67,7 +70,7 @@ public List<PrivilegedOperation> bootstrap(Configuration configuration)
|
||||
throws ResourceHandlerException {
|
||||
List<GpuDevice> usableGpus;
|
||||
try {
|
||||
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn();
|
||||
usableGpus = gpuDiscoverer.getGpusUsableByYarn();
|
||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||
+ "any usable GPU devices, please double check configuration!";
|
||||
|
@ -34,6 +34,8 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
|
||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||
import org.slf4j.Logger;
|
||||
@ -96,7 +98,10 @@ public synchronized void initialize(Context context)
|
||||
|
||||
ResourcePlugin plugin = null;
|
||||
if (resourceName.equals(GPU_URI)) {
|
||||
plugin = new GpuResourcePlugin();
|
||||
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
|
||||
final GpuNodeResourceUpdateHandler updateHandler =
|
||||
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
|
||||
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
|
||||
} else if (resourceName.equals(FPGA_URI)) {
|
||||
plugin = new FpgaResourcePlugin();
|
||||
}
|
||||
|
@ -58,11 +58,6 @@ public class GpuDiscoverer {
|
||||
// command should not run more than 10 sec.
|
||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||
private static GpuDiscoverer instance;
|
||||
|
||||
static {
|
||||
instance = new GpuDiscoverer();
|
||||
}
|
||||
|
||||
private Configuration conf = null;
|
||||
private String pathOfGpuBinary = null;
|
||||
@ -293,8 +288,4 @@ Map<String, String> getEnvironmentToRunCommand() {
|
||||
String getPathOfGpuBinary() {
|
||||
return pathOfGpuBinary;
|
||||
}
|
||||
|
||||
public static GpuDiscoverer getInstance() {
|
||||
return instance;
|
||||
}
|
||||
}
|
||||
|
@ -35,16 +35,20 @@
|
||||
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
||||
private final GpuDiscoverer gpuDiscoverer;
|
||||
|
||||
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
|
||||
this.gpuDiscoverer = gpuDiscoverer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateConfiguredResource(Resource res) throws YarnException {
|
||||
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
||||
|
||||
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance()
|
||||
.getGpusUsableByYarn();
|
||||
List<GpuDevice> usableGpus = gpuDiscoverer.getGpusUsableByYarn();
|
||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||
String message = "GPU is enabled, " +
|
||||
"but couldn't find any usable GPUs on the NodeManager!";
|
||||
"but could not find any usable GPUs on the NodeManager!";
|
||||
LOG.error(message);
|
||||
// No gpu can be used by YARN.
|
||||
throw new YarnException(message);
|
||||
|
@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
@ -34,18 +33,23 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class GpuResourcePlugin implements ResourcePlugin {
|
||||
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
||||
private final GpuDiscoverer gpuDiscoverer;
|
||||
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
||||
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
|
||||
private DockerCommandPlugin dockerCommandPlugin = null;
|
||||
|
||||
public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler,
|
||||
GpuDiscoverer gpuDiscoverer) {
|
||||
this.resourceDiscoverHandler = resourceDiscoverHandler;
|
||||
this.gpuDiscoverer = gpuDiscoverer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void initialize(Context context) throws YarnException {
|
||||
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
|
||||
GpuDiscoverer.getInstance().initialize(context.getConf());
|
||||
dockerCommandPlugin =
|
||||
this.gpuDiscoverer.initialize(context.getConf());
|
||||
this.dockerCommandPlugin =
|
||||
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
||||
context.getConf());
|
||||
}
|
||||
@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler(
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
||||
if (gpuResourceHandler == null) {
|
||||
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
|
||||
privilegedOperationExecutor);
|
||||
privilegedOperationExecutor, gpuDiscoverer);
|
||||
}
|
||||
|
||||
return gpuResourceHandler;
|
||||
@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||
GpuDeviceInformation gpuDeviceInformation =
|
||||
GpuDiscoverer.getInstance().getGpuDeviceInformation();
|
||||
gpuDiscoverer.getGpuDeviceInformation();
|
||||
GpuResourceAllocator gpuResourceAllocator =
|
||||
gpuResourceHandler.getGpuAllocator();
|
||||
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
|
||||
|
@ -71,6 +71,7 @@ public class TestGpuResourceHandler {
|
||||
private GpuResourceHandlerImpl gpuResourceHandler;
|
||||
private NMStateStoreService mockNMStateStore;
|
||||
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
|
||||
private GpuDiscoverer gpuDiscoverer;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
@ -89,8 +90,9 @@ public void setup() {
|
||||
runningContainersMap = new ConcurrentHashMap<>();
|
||||
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
||||
|
||||
gpuDiscoverer = new GpuDiscoverer();
|
||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
|
||||
mockPrivilegedExecutor);
|
||||
mockPrivilegedExecutor, gpuDiscoverer);
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -98,7 +100,7 @@ public void testBootStrap() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
|
||||
@ -162,7 +164,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled)
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
@ -251,7 +253,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
@ -280,7 +282,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
||||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
try {
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
@ -315,7 +317,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||
public void testAllocationStored() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
@ -361,9 +363,9 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
||||
|
||||
GpuResourceHandlerImpl gpuNULLStateResourceHandler =
|
||||
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
||||
mockPrivilegedExecutor);
|
||||
mockPrivilegedExecutor, gpuDiscoverer);
|
||||
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuNULLStateResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
@ -383,7 +385,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
||||
public void testRecoverResourceAllocation() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
|
Loading…
Reference in New Issue
Block a user