YARN-9121. Replace GpuDiscoverer.getInstance() with a readable object for easy access control. Contributed by Szilard Nemeth.
This commit is contained in:
parent
92b1fdcece
commit
5e91ebd91a
@ -52,14 +52,17 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
|||||||
private final GpuResourceAllocator gpuAllocator;
|
private final GpuResourceAllocator gpuAllocator;
|
||||||
private final CGroupsHandler cGroupsHandler;
|
private final CGroupsHandler cGroupsHandler;
|
||||||
private final PrivilegedOperationExecutor privilegedOperationExecutor;
|
private final PrivilegedOperationExecutor privilegedOperationExecutor;
|
||||||
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
|
|
||||||
public GpuResourceHandlerImpl(Context nmContext,
|
public GpuResourceHandlerImpl(Context nmContext,
|
||||||
CGroupsHandler cGroupsHandler,
|
CGroupsHandler cGroupsHandler,
|
||||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
PrivilegedOperationExecutor privilegedOperationExecutor,
|
||||||
|
GpuDiscoverer gpuDiscoverer) {
|
||||||
this.nmContext = nmContext;
|
this.nmContext = nmContext;
|
||||||
this.cGroupsHandler = cGroupsHandler;
|
this.cGroupsHandler = cGroupsHandler;
|
||||||
this.privilegedOperationExecutor = privilegedOperationExecutor;
|
this.privilegedOperationExecutor = privilegedOperationExecutor;
|
||||||
gpuAllocator = new GpuResourceAllocator(nmContext);
|
this.gpuAllocator = new GpuResourceAllocator(nmContext);
|
||||||
|
this.gpuDiscoverer = gpuDiscoverer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -67,7 +70,7 @@ public List<PrivilegedOperation> bootstrap(Configuration configuration)
|
|||||||
throws ResourceHandlerException {
|
throws ResourceHandlerException {
|
||||||
List<GpuDevice> usableGpus;
|
List<GpuDevice> usableGpus;
|
||||||
try {
|
try {
|
||||||
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn();
|
usableGpus = gpuDiscoverer.getGpusUsableByYarn();
|
||||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||||
+ "any usable GPU devices, please double check configuration!";
|
+ "any usable GPU devices, please double check configuration!";
|
||||||
|
@ -34,6 +34,8 @@
|
|||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DeviceMappingManager;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.DevicePluginAdapter;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -96,7 +98,10 @@ public synchronized void initialize(Context context)
|
|||||||
|
|
||||||
ResourcePlugin plugin = null;
|
ResourcePlugin plugin = null;
|
||||||
if (resourceName.equals(GPU_URI)) {
|
if (resourceName.equals(GPU_URI)) {
|
||||||
plugin = new GpuResourcePlugin();
|
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
|
||||||
|
final GpuNodeResourceUpdateHandler updateHandler =
|
||||||
|
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
|
||||||
|
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
|
||||||
} else if (resourceName.equals(FPGA_URI)) {
|
} else if (resourceName.equals(FPGA_URI)) {
|
||||||
plugin = new FpgaResourcePlugin();
|
plugin = new FpgaResourcePlugin();
|
||||||
}
|
}
|
||||||
|
@ -58,11 +58,6 @@ public class GpuDiscoverer {
|
|||||||
// command should not run more than 10 sec.
|
// command should not run more than 10 sec.
|
||||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||||
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||||
private static GpuDiscoverer instance;
|
|
||||||
|
|
||||||
static {
|
|
||||||
instance = new GpuDiscoverer();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Configuration conf = null;
|
private Configuration conf = null;
|
||||||
private String pathOfGpuBinary = null;
|
private String pathOfGpuBinary = null;
|
||||||
@ -293,8 +288,4 @@ Map<String, String> getEnvironmentToRunCommand() {
|
|||||||
String getPathOfGpuBinary() {
|
String getPathOfGpuBinary() {
|
||||||
return pathOfGpuBinary;
|
return pathOfGpuBinary;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GpuDiscoverer getInstance() {
|
|
||||||
return instance;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -35,16 +35,20 @@
|
|||||||
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||||
private static final Logger LOG =
|
private static final Logger LOG =
|
||||||
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
||||||
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
|
|
||||||
|
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
|
||||||
|
this.gpuDiscoverer = gpuDiscoverer;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void updateConfiguredResource(Resource res) throws YarnException {
|
public void updateConfiguredResource(Resource res) throws YarnException {
|
||||||
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
||||||
|
|
||||||
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance()
|
List<GpuDevice> usableGpus = gpuDiscoverer.getGpusUsableByYarn();
|
||||||
.getGpusUsableByYarn();
|
|
||||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||||
String message = "GPU is enabled, " +
|
String message = "GPU is enabled, " +
|
||||||
"but couldn't find any usable GPUs on the NodeManager!";
|
"but could not find any usable GPUs on the NodeManager!";
|
||||||
LOG.error(message);
|
LOG.error(message);
|
||||||
// No gpu can be used by YARN.
|
// No gpu can be used by YARN.
|
||||||
throw new YarnException(message);
|
throw new YarnException(message);
|
||||||
|
@ -18,7 +18,6 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||||
@ -34,18 +33,23 @@
|
|||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
public class GpuResourcePlugin implements ResourcePlugin {
|
public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
|
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
||||||
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
||||||
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
|
|
||||||
private DockerCommandPlugin dockerCommandPlugin = null;
|
private DockerCommandPlugin dockerCommandPlugin = null;
|
||||||
|
|
||||||
|
public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler,
|
||||||
|
GpuDiscoverer gpuDiscoverer) {
|
||||||
|
this.resourceDiscoverHandler = resourceDiscoverHandler;
|
||||||
|
this.gpuDiscoverer = gpuDiscoverer;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized void initialize(Context context) throws YarnException {
|
public synchronized void initialize(Context context) throws YarnException {
|
||||||
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
|
this.gpuDiscoverer.initialize(context.getConf());
|
||||||
GpuDiscoverer.getInstance().initialize(context.getConf());
|
this.dockerCommandPlugin =
|
||||||
dockerCommandPlugin =
|
|
||||||
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
||||||
context.getConf());
|
context.getConf());
|
||||||
}
|
}
|
||||||
@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler(
|
|||||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
||||||
if (gpuResourceHandler == null) {
|
if (gpuResourceHandler == null) {
|
||||||
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
|
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
|
||||||
privilegedOperationExecutor);
|
privilegedOperationExecutor, gpuDiscoverer);
|
||||||
}
|
}
|
||||||
|
|
||||||
return gpuResourceHandler;
|
return gpuResourceHandler;
|
||||||
@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NMResourceInfo getNMResourceInfo() throws YarnException {
|
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||||
GpuDeviceInformation gpuDeviceInformation =
|
GpuDeviceInformation gpuDeviceInformation =
|
||||||
GpuDiscoverer.getInstance().getGpuDeviceInformation();
|
gpuDiscoverer.getGpuDeviceInformation();
|
||||||
GpuResourceAllocator gpuResourceAllocator =
|
GpuResourceAllocator gpuResourceAllocator =
|
||||||
gpuResourceHandler.getGpuAllocator();
|
gpuResourceHandler.getGpuAllocator();
|
||||||
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
|
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
|
||||||
|
@ -71,6 +71,7 @@ public class TestGpuResourceHandler {
|
|||||||
private GpuResourceHandlerImpl gpuResourceHandler;
|
private GpuResourceHandlerImpl gpuResourceHandler;
|
||||||
private NMStateStoreService mockNMStateStore;
|
private NMStateStoreService mockNMStateStore;
|
||||||
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
|
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
|
||||||
|
private GpuDiscoverer gpuDiscoverer;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() {
|
public void setup() {
|
||||||
@ -89,8 +90,9 @@ public void setup() {
|
|||||||
runningContainersMap = new ConcurrentHashMap<>();
|
runningContainersMap = new ConcurrentHashMap<>();
|
||||||
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
||||||
|
|
||||||
|
gpuDiscoverer = new GpuDiscoverer();
|
||||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
|
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
|
||||||
mockPrivilegedExecutor);
|
mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -98,7 +100,7 @@ public void testBootStrap() throws Exception {
|
|||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||||
|
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
|
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
|
||||||
@ -162,7 +164,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled)
|
|||||||
throws Exception {
|
throws Exception {
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
Assert.assertEquals(4,
|
||||||
@ -251,7 +253,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
|||||||
throws Exception {
|
throws Exception {
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
Assert.assertEquals(4,
|
||||||
@ -280,7 +282,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
|||||||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
@ -315,7 +317,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception {
|
|||||||
public void testAllocationStored() throws Exception {
|
public void testAllocationStored() throws Exception {
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
Assert.assertEquals(4,
|
||||||
@ -361,9 +363,9 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
|||||||
|
|
||||||
GpuResourceHandlerImpl gpuNULLStateResourceHandler =
|
GpuResourceHandlerImpl gpuNULLStateResourceHandler =
|
||||||
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
||||||
mockPrivilegedExecutor);
|
mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
|
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuNULLStateResourceHandler.bootstrap(conf);
|
gpuNULLStateResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
Assert.assertEquals(4,
|
||||||
@ -383,7 +385,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
|||||||
public void testRecoverResourceAllocation() throws Exception {
|
public void testRecoverResourceAllocation() throws Exception {
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
GpuDiscoverer.getInstance().initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
Assert.assertEquals(4,
|
||||||
|
Loading…
Reference in New Issue
Block a user