YARN-9337. GPU auto-discovery script runs even when the resource is given by hand. Contributed by Adam Antal

This commit is contained in:
Szilard Nemeth 2019-07-12 17:28:14 +02:00
parent 8b3c6791b1
commit 61b0c2bb7c
2 changed files with 52 additions and 25 deletions

View File

@ -69,6 +69,8 @@ public class GpuDiscoverer {
private int numOfErrorExecutionSinceLastSucceed = 0; private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuDeviceInformation lastDiscoveredGpuInformation = null; private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private List<GpuDevice> gpuDevicesFromUser;
private void validateConfOrThrowException() throws YarnException { private void validateConfOrThrowException() throws YarnException {
if (conf == null) { if (conf == null) {
throw new YarnException("Please initialize (call initialize) before use " throw new YarnException("Please initialize (call initialize) before use "
@ -141,6 +143,14 @@ public class GpuDiscoverer {
} }
} }
/**
 * Tells whether the allowed-devices setting selects automatic discovery.
 *
 * Reads {@link YarnConfiguration#NM_GPU_ALLOWED_DEVICES} from the current
 * configuration, falling back to
 * {@link YarnConfiguration#AUTOMATICALLY_DISCOVER_GPU_DEVICES} when the
 * property is not set, and compares the result against that same marker.
 *
 * @return true if the configured value equals the auto-discovery marker
 *         (which is also the case when the property is absent),
 *         false otherwise.
 */
private boolean IsAutoDiscoveryEnabled() {
  final String autoDiscoveryMarker =
      YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES;
  // An unset property resolves to the marker itself, so it counts as "auto".
  final String configuredDevices = conf.get(
      YarnConfiguration.NM_GPU_ALLOWED_DEVICES, autoDiscoveryMarker);
  return configuredDevices.equals(autoDiscoveryMarker);
}
/** /**
* Get list of GPU devices usable by YARN. * Get list of GPU devices usable by YARN.
* *
@ -151,15 +161,13 @@ public class GpuDiscoverer {
throws YarnException { throws YarnException {
validateConfOrThrowException(); validateConfOrThrowException();
String allowedDevicesStr = conf.get( if (IsAutoDiscoveryEnabled()) {
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
if (allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
return parseGpuDevicesFromAutoDiscoveredGpuInfo(); return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else { } else {
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr); if (gpuDevicesFromUser == null) {
gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
}
return gpuDevicesFromUser;
} }
} }
@ -191,16 +199,16 @@ public class GpuDiscoverer {
} }
/** /**
* @param devices allowed devices coming from the config.
* Individual devices should be separated by commas.
* <br>The format of individual devices should be:
* &lt;index:&gt;&lt;minorNumber&gt;
* @return List of GpuDevices * @return List of GpuDevices
* @throws YarnException when a GPU device is defined as a duplicate. * @throws YarnException when a GPU device is defined as a duplicate.
* The first duplicate GPU device will be added to the exception message. * The first duplicate GPU device will be added to the exception message.
*/ */
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices) private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
throws YarnException { throws YarnException {
String devices = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
if (devices.trim().isEmpty()) { if (devices.trim().isEmpty()) {
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified(); throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
} }
@ -242,19 +250,21 @@ public class GpuDiscoverer {
public synchronized void initialize(Configuration config) public synchronized void initialize(Configuration config)
throws YarnException { throws YarnException {
this.conf = config; this.conf = config;
numOfErrorExecutionSinceLastSucceed = 0; if (IsAutoDiscoveryEnabled()) {
lookUpAutoDiscoveryBinary(config); numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config);
// Try to discover GPU information once and print // Try to discover GPU information once and print
try { try {
LOG.info("Trying to discover GPU information ..."); LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation(); GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info("Discovered GPU information: " + info.toString()); LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) { } catch (YarnException e) {
String msg = String msg =
"Failed to discover GPU information from system, exception message:" "Failed to discover GPU information from system, exception message:"
+ e.getMessage() + " continue..."; + e.getMessage() + " continue...";
LOG.warn(msg); LOG.warn(msg);
}
} }
} }

View File

@ -40,6 +40,7 @@ import java.util.List;
import java.util.function.Consumer; import java.util.function.Consumer;
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows; import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_GPU_ALLOWED_DEVICES;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.not; import static org.hamcrest.CoreMatchers.not;
@ -49,6 +50,9 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
public class TestGpuDiscoverer { public class TestGpuDiscoverer {
private static final Logger LOG = LoggerFactory.getLogger( private static final Logger LOG = LoggerFactory.getLogger(
@ -96,7 +100,7 @@ public class TestGpuDiscoverer {
private Configuration createConfigWithAllowedDevices(String s) { private Configuration createConfigWithAllowedDevices(String s) {
Configuration conf = new Configuration(false); Configuration conf = new Configuration(false);
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s); conf.set(NM_GPU_ALLOWED_DEVICES, s);
setupFakeBinary(conf); setupFakeBinary(conf);
return conf; return conf;
} }
@ -495,4 +499,17 @@ public class TestGpuDiscoverer {
"executable in the default directories:")); "executable in the default directories:"));
} }
} }
/**
 * With an explicit device list ("0:1,2:3") configured, neither
 * initialize() nor getGpusUsableByYarn() may invoke
 * getGpuDeviceInformation() on the discoverer.
 */
@Test
public void testScriptNotCalled() throws YarnException {
  // Arrange: configuration with user-defined devices.
  Configuration explicitDevicesConf = new Configuration();
  explicitDevicesConf.set(
      YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
  GpuDiscoverer spiedDiscoverer = spy(GpuDiscoverer.class);

  // Act: run both entry points exercised by this test.
  spiedDiscoverer.initialize(explicitDevicesConf);
  spiedDiscoverer.getGpusUsableByYarn();

  // Verify: the device-information lookup was never called.
  verify(spiedDiscoverer, never()).getGpuDeviceInformation();
}
} }