YARN-9133. Make tests more easy to comprehend in TestGpuResourceHandler. Contributed by Peter Bacsko
This commit is contained in:
parent
2432356570
commit
3e0410449f
@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
@ -28,6 +30,7 @@
|
|||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||||
@ -43,9 +46,10 @@
|
|||||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||||
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
|
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Assert;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
@ -53,11 +57,13 @@
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
import static org.mockito.ArgumentMatchers.anyList;
|
import static org.mockito.ArgumentMatchers.anyList;
|
||||||
import static org.mockito.ArgumentMatchers.anyString;
|
import static org.mockito.ArgumentMatchers.anyString;
|
||||||
@ -65,7 +71,6 @@
|
|||||||
import static org.mockito.Mockito.doThrow;
|
import static org.mockito.Mockito.doThrow;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.never;
|
import static org.mockito.Mockito.never;
|
||||||
import static org.mockito.Mockito.times;
|
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
@ -103,11 +108,14 @@ private Configuration createDefaultConfig() throws IOException {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private File setupFakeGpuDiscoveryBinary() throws IOException {
|
private File setupFakeGpuDiscoveryBinary() throws IOException {
|
||||||
File fakeBinary = new File(getTestParentDirectory() + "/fake-nvidia-smi");
|
File fakeBinary = new File(getTestParentDirectory() + "/nvidia-smi");
|
||||||
touchFile(fakeBinary);
|
touchFile(fakeBinary);
|
||||||
return fakeBinary;
|
return fakeBinary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Rule
|
||||||
|
public ExpectedException expected = ExpectedException.none();
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws IOException {
|
public void setup() throws IOException {
|
||||||
createTestDataDirectory();
|
createTestDataDirectory();
|
||||||
@ -120,16 +128,20 @@ public void setup() throws IOException {
|
|||||||
mockNMStateStore = mock(NMStateStoreService.class);
|
mockNMStateStore = mock(NMStateStoreService.class);
|
||||||
|
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
|
Context nmContext = createMockNmContext(conf);
|
||||||
|
|
||||||
|
gpuDiscoverer = new GpuDiscoverer();
|
||||||
|
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
|
||||||
|
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Context createMockNmContext(Configuration conf) {
|
||||||
Context nmctx = mock(Context.class);
|
Context nmctx = mock(Context.class);
|
||||||
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
|
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
|
||||||
when(nmctx.getConf()).thenReturn(conf);
|
when(nmctx.getConf()).thenReturn(conf);
|
||||||
runningContainersMap = new ConcurrentHashMap<>();
|
runningContainersMap = new ConcurrentHashMap<>();
|
||||||
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
||||||
|
return nmctx;
|
||||||
gpuDiscoverer = new GpuDiscoverer();
|
|
||||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
|
|
||||||
mockPrivilegedExecutor, gpuDiscoverer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
@ -138,54 +150,63 @@ public void cleanupTestFiles() throws IOException {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testBootStrap() throws Exception {
|
public void testBootstrapWithRealGpuDiscoverer() throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
Configuration conf = createDefaultConfig();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||||
|
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
|
|
||||||
|
List<GpuDevice> allowedGpus =
|
||||||
|
gpuResourceHandler.getGpuAllocator().getAllowedGpusCopy();
|
||||||
|
assertEquals("Unexpected number of allowed GPU devices!", 1,
|
||||||
|
allowedGpus.size());
|
||||||
|
assertEquals("Expected GPU device does not equal to found device!",
|
||||||
|
new GpuDevice(0, 0), allowedGpus.get(0));
|
||||||
|
verify(mockCGroupsHandler).initializeCGroupController(
|
||||||
CGroupsHandler.CGroupController.DEVICES);
|
CGroupsHandler.CGroupController.DEVICES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBootstrapWithMockGpuDiscoverer() throws Exception {
|
||||||
|
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
|
||||||
|
Configuration conf = new YarnConfiguration();
|
||||||
|
mockDiscoverer.initialize(conf);
|
||||||
|
|
||||||
|
expected.expect(ResourceHandlerException.class);
|
||||||
|
gpuResourceHandler.bootstrap(conf);
|
||||||
|
}
|
||||||
|
|
||||||
private static ContainerId getContainerId(int id) {
|
private static ContainerId getContainerId(int id) {
|
||||||
return ContainerId.newContainerId(ApplicationAttemptId
|
return ContainerId.newContainerId(ApplicationAttemptId
|
||||||
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
|
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
|
private static Container mockContainerWithGpuRequest(int id, Resource res,
|
||||||
boolean dockerContainerEnabled) {
|
ContainerLaunchContext launchContext) {
|
||||||
Container c = mock(Container.class);
|
Container c = mock(Container.class);
|
||||||
when(c.getContainerId()).thenReturn(getContainerId(id));
|
when(c.getContainerId()).thenReturn(getContainerId(id));
|
||||||
|
|
||||||
Resource res = Resource.newInstance(1024, 1);
|
|
||||||
ResourceMappings resMapping = new ResourceMappings();
|
|
||||||
|
|
||||||
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
|
|
||||||
when(c.getResource()).thenReturn(res);
|
when(c.getResource()).thenReturn(res);
|
||||||
when(c.getResourceMappings()).thenReturn(resMapping);
|
when(c.getResourceMappings()).thenReturn(new ResourceMappings());
|
||||||
|
when(c.getLaunchContext()).thenReturn(launchContext);
|
||||||
ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
|
|
||||||
Map<String, String> env = new HashMap<>();
|
|
||||||
if (dockerContainerEnabled) {
|
|
||||||
env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE,
|
|
||||||
ContainerRuntimeConstants.CONTAINER_RUNTIME_DOCKER);
|
|
||||||
}
|
|
||||||
when(clc.getEnvironment()).thenReturn(env);
|
|
||||||
when(c.getLaunchContext()).thenReturn(clc);
|
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static Resource createResourceRequest(int numGpuRequest) {
|
||||||
|
Resource res = Resource.newInstance(1024, 1);
|
||||||
|
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
private static Container mockContainerWithGpuRequest(int id,
|
private static Container mockContainerWithGpuRequest(int id,
|
||||||
int numGpuRequest) {
|
Resource res) {
|
||||||
return mockContainerWithGpuRequest(id, numGpuRequest, false);
|
return mockContainerWithGpuRequest(id, res, createLaunchContext());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyDeniedDevices(ContainerId containerId,
|
private void verifyDeniedDevices(ContainerId containerId,
|
||||||
List<GpuDevice> deniedDevices)
|
List<GpuDevice> deniedDevices)
|
||||||
throws ResourceHandlerException, PrivilegedOperationException {
|
throws ResourceHandlerException, PrivilegedOperationException {
|
||||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
verify(mockCGroupsHandler).createCGroup(
|
||||||
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
|
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
|
||||||
|
|
||||||
if (null != deniedDevices && !deniedDevices.isEmpty()) {
|
if (null != deniedDevices && !deniedDevices.isEmpty()) {
|
||||||
@ -193,7 +214,7 @@ private void verifyDeniedDevices(ContainerId containerId,
|
|||||||
for (GpuDevice deniedDevice : deniedDevices) {
|
for (GpuDevice deniedDevice : deniedDevices) {
|
||||||
deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
|
deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
|
||||||
}
|
}
|
||||||
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
|
verify(mockPrivilegedExecutor).executePrivilegedOperation(
|
||||||
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
|
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
|
||||||
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
|
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
|
||||||
containerId.toString(),
|
containerId.toString(),
|
||||||
@ -202,104 +223,139 @@ private void verifyDeniedDevices(ContainerId containerId,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void commonTestAllocation(boolean dockerContainerEnabled)
|
private static ContainerLaunchContext createLaunchContextDocker() {
|
||||||
throws Exception {
|
ContainerLaunchContext launchContext = mock(ContainerLaunchContext.class);
|
||||||
|
ImmutableMap<String, String> env = ImmutableMap.<String, String>builder()
|
||||||
|
.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE,
|
||||||
|
ContainerRuntimeConstants.CONTAINER_RUNTIME_DOCKER)
|
||||||
|
.build();
|
||||||
|
when(launchContext.getEnvironment()).thenReturn(env);
|
||||||
|
return launchContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ContainerLaunchContext createLaunchContext() {
|
||||||
|
ContainerLaunchContext launchContext = mock(ContainerLaunchContext.class);
|
||||||
|
when(launchContext.getEnvironment()).thenReturn(Maps.newHashMap());
|
||||||
|
return launchContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void startContainerWithGpuRequestsDocker(int id, int gpus)
|
||||||
|
throws ResourceHandlerException {
|
||||||
|
gpuResourceHandler.preStart(
|
||||||
|
mockContainerWithGpuRequest(id, createResourceRequest(gpus),
|
||||||
|
createLaunchContextDocker()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void startContainerWithGpuRequests(int id, int gpus)
|
||||||
|
throws ResourceHandlerException {
|
||||||
|
gpuResourceHandler.preStart(
|
||||||
|
mockContainerWithGpuRequest(id, createResourceRequest(gpus),
|
||||||
|
createLaunchContext()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyNumberOfAvailableGpus(int expectedAvailable,
|
||||||
|
GpuResourceHandlerImpl resourceHandler) {
|
||||||
|
assertEquals("Unexpected number of available GPU devices!",
|
||||||
|
expectedAvailable,
|
||||||
|
resourceHandler.getGpuAllocator().getAvailableGpus());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyCgroupsDeletedForContainer(int i)
|
||||||
|
throws ResourceHandlerException {
|
||||||
|
verify(mockCGroupsHandler).createCGroup(
|
||||||
|
CGroupsHandler.CGroupController.DEVICES, getContainerId(i).toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeGpus() throws YarnException, IOException {
|
||||||
Configuration conf = createDefaultConfig();
|
Configuration conf = createDefaultConfig();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
|
|
||||||
|
gpuDiscoverer = new GpuDiscoverer();
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
Context nmContext = createMockNmContext(conf);
|
||||||
|
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
|
||||||
|
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
verifyNumberOfAvailableGpus(4, gpuResourceHandler);
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
/* Start container 1, asks 3 containers */
|
|
||||||
gpuResourceHandler.preStart(
|
|
||||||
mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
|
|
||||||
|
|
||||||
// Only device=4 will be blocked.
|
|
||||||
if (dockerContainerEnabled) {
|
|
||||||
verifyDeniedDevices(getContainerId(1), Collections.emptyList());
|
|
||||||
} else{
|
|
||||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Start container 2, asks 2 containers. Excepted to fail */
|
|
||||||
boolean failedToAllocate = false;
|
|
||||||
try {
|
|
||||||
gpuResourceHandler.preStart(
|
|
||||||
mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
|
|
||||||
} catch (ResourceHandlerException e) {
|
|
||||||
failedToAllocate = true;
|
|
||||||
}
|
|
||||||
Assert.assertTrue(failedToAllocate);
|
|
||||||
|
|
||||||
/* Start container 3, ask 1 container, succeeded */
|
|
||||||
gpuResourceHandler.preStart(
|
|
||||||
mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
|
|
||||||
|
|
||||||
// devices = 0/1/3 will be blocked
|
|
||||||
if (dockerContainerEnabled) {
|
|
||||||
verifyDeniedDevices(getContainerId(3), Collections.emptyList());
|
|
||||||
} else {
|
|
||||||
verifyDeniedDevices(getContainerId(3), Arrays
|
|
||||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
|
|
||||||
new GpuDevice(2, 3)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Start container 4, ask 0 container, succeeded */
|
|
||||||
gpuResourceHandler.preStart(
|
|
||||||
mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
|
|
||||||
|
|
||||||
if (dockerContainerEnabled) {
|
|
||||||
verifyDeniedDevices(getContainerId(4), Collections.emptyList());
|
|
||||||
} else{
|
|
||||||
// All devices will be blocked
|
|
||||||
verifyDeniedDevices(getContainerId(4), Arrays
|
|
||||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
|
|
||||||
new GpuDevice(3, 4)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Release container-1, expect cgroups deleted */
|
|
||||||
gpuResourceHandler.postComplete(getContainerId(1));
|
|
||||||
|
|
||||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
|
||||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
|
|
||||||
Assert.assertEquals(3,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
/* Release container-3, expect cgroups deleted */
|
|
||||||
gpuResourceHandler.postComplete(getContainerId(3));
|
|
||||||
|
|
||||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
|
||||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
|
|
||||||
Assert.assertEquals(4,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAllocationWhenDockerContainerEnabled() throws Exception {
|
public void testAllocationWhenDockerContainerEnabled() throws Exception {
|
||||||
// When docker container is enabled, no devices should be written to
|
// When docker container is enabled, no devices should be written to
|
||||||
// devices.deny.
|
// devices.deny.
|
||||||
commonTestAllocation(true);
|
initializeGpus();
|
||||||
|
|
||||||
|
startContainerWithGpuRequestsDocker(1, 3);
|
||||||
|
verifyDeniedDevices(getContainerId(1), Collections.emptyList());
|
||||||
|
|
||||||
|
/* Start container 2, which asks for 2 GPUs. Expected to fail */
|
||||||
|
boolean failedToAllocate = false;
|
||||||
|
try {
|
||||||
|
startContainerWithGpuRequestsDocker(2, 2);
|
||||||
|
} catch (ResourceHandlerException e) {
|
||||||
|
failedToAllocate = true;
|
||||||
|
}
|
||||||
|
assertTrue("Container allocation is expected to fail!", failedToAllocate);
|
||||||
|
|
||||||
|
startContainerWithGpuRequestsDocker(3, 1);
|
||||||
|
verifyDeniedDevices(getContainerId(3), Collections.emptyList());
|
||||||
|
|
||||||
|
startContainerWithGpuRequestsDocker(4, 0);
|
||||||
|
verifyDeniedDevices(getContainerId(4), Collections.emptyList());
|
||||||
|
|
||||||
|
gpuResourceHandler.postComplete(getContainerId(1));
|
||||||
|
verifyCgroupsDeletedForContainer(1);
|
||||||
|
verifyNumberOfAvailableGpus(3, gpuResourceHandler);
|
||||||
|
|
||||||
|
gpuResourceHandler.postComplete(getContainerId(3));
|
||||||
|
verifyCgroupsDeletedForContainer(3);
|
||||||
|
verifyNumberOfAvailableGpus(4, gpuResourceHandler);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAllocation() throws Exception {
|
public void testAllocation() throws Exception {
|
||||||
commonTestAllocation(false);
|
initializeGpus();
|
||||||
|
|
||||||
|
// Start container 1, asking for 3 GPUs --> only device=4 will be blocked.
|
||||||
|
startContainerWithGpuRequests(1, 3);
|
||||||
|
verifyDeniedDevices(getContainerId(1),
|
||||||
|
Collections.singletonList(new GpuDevice(3, 4)));
|
||||||
|
|
||||||
|
/* Start container 2, which asks for 2 GPUs. Expected to fail */
|
||||||
|
boolean failedToAllocate = false;
|
||||||
|
try {
|
||||||
|
startContainerWithGpuRequests(2, 2);
|
||||||
|
} catch (ResourceHandlerException e) {
|
||||||
|
failedToAllocate = true;
|
||||||
|
}
|
||||||
|
assertTrue("Container allocation is expected to fail!", failedToAllocate);
|
||||||
|
|
||||||
|
// Start container 3, asking for 1 GPU; expected to succeed.
|
||||||
|
// devices = 0/1/3 will be blocked
|
||||||
|
startContainerWithGpuRequests(3, 1);
|
||||||
|
verifyDeniedDevices(getContainerId(3), Arrays.asList(new GpuDevice(0, 0),
|
||||||
|
new GpuDevice(1, 1), new GpuDevice(2, 3)));
|
||||||
|
|
||||||
|
// Start container 4, asking for 0 GPUs; expected to succeed.
|
||||||
|
// --> All devices will be blocked
|
||||||
|
startContainerWithGpuRequests(4, 0);
|
||||||
|
verifyDeniedDevices(getContainerId(4), Arrays.asList(new GpuDevice(0, 0),
|
||||||
|
new GpuDevice(1, 1), new GpuDevice(2, 3), new GpuDevice(3, 4)));
|
||||||
|
|
||||||
|
gpuResourceHandler.postComplete(getContainerId(1));
|
||||||
|
verifyCgroupsDeletedForContainer(1);
|
||||||
|
verifyNumberOfAvailableGpus(3, gpuResourceHandler);
|
||||||
|
|
||||||
|
gpuResourceHandler.postComplete(getContainerId(3));
|
||||||
|
verifyCgroupsDeletedForContainer(3);
|
||||||
|
verifyNumberOfAvailableGpus(4, gpuResourceHandler);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
@Test
|
@Test
|
||||||
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
public void testAssignedGpuWillBeCleanedUpWhenStoreOpFails()
|
||||||
throws Exception {
|
throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
initializeGpus();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
|
||||||
gpuDiscoverer.initialize(conf);
|
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
|
||||||
Assert.assertEquals(4,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
|
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
|
||||||
.storeAssignedResources(
|
.storeAssignedResources(
|
||||||
@ -308,16 +364,16 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
|||||||
boolean exception = false;
|
boolean exception = false;
|
||||||
/* Start container 1, asks 3 containers */
|
/* Start container 1, asks 3 containers */
|
||||||
try {
|
try {
|
||||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
|
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1,
|
||||||
|
createResourceRequest(3)));
|
||||||
} catch (ResourceHandlerException e) {
|
} catch (ResourceHandlerException e) {
|
||||||
exception = true;
|
exception = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert.assertTrue("preStart should throw exception", exception);
|
assertTrue("preStart should throw exception", exception);
|
||||||
|
|
||||||
// After preStart, we still have 4 available GPU since the store op fails.
|
// After preStart, we still have 4 available GPU since the store op failed.
|
||||||
Assert.assertEquals(4,
|
verifyNumberOfAvailableGpus(4, gpuResourceHandler);
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -328,45 +384,40 @@ public void testAllocationWithoutAllowedGpus() throws Exception {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
Assert.fail("Should fail because no GPU available");
|
fail("Should fail because no GPU available");
|
||||||
} catch (ResourceHandlerException e) {
|
} catch (ResourceHandlerException e) {
|
||||||
// Expected because of no resource available
|
// Expected because of no resource available
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Start container 1, asks 0 containers */
|
/* Start container 1, asks 0 containers */
|
||||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
|
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1,
|
||||||
|
createResourceRequest(0)));
|
||||||
verifyDeniedDevices(getContainerId(1), Collections.emptyList());
|
verifyDeniedDevices(getContainerId(1), Collections.emptyList());
|
||||||
|
|
||||||
/* Start container 2, which asks for 1 GPU. Expected to fail */
|
/* Start container 2, which asks for 1 GPU. Expected to fail */
|
||||||
boolean failedToAllocate = false;
|
boolean failedToAllocate = false;
|
||||||
try {
|
try {
|
||||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
|
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2,
|
||||||
|
createResourceRequest(1)));
|
||||||
} catch (ResourceHandlerException e) {
|
} catch (ResourceHandlerException e) {
|
||||||
failedToAllocate = true;
|
failedToAllocate = true;
|
||||||
}
|
}
|
||||||
Assert.assertTrue(failedToAllocate);
|
assertTrue("Container allocation is expected to fail!", failedToAllocate);
|
||||||
|
|
||||||
/* Release container 1, expect cgroups deleted */
|
/* Release container 1, expect cgroups deleted */
|
||||||
gpuResourceHandler.postComplete(getContainerId(1));
|
gpuResourceHandler.postComplete(getContainerId(1));
|
||||||
|
|
||||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
verifyCgroupsDeletedForContainer(1);
|
||||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
|
verifyNumberOfAvailableGpus(0, gpuResourceHandler);
|
||||||
Assert.assertEquals(0,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAllocationStored() throws Exception {
|
public void testAllocationStored() throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
initializeGpus();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
|
||||||
gpuDiscoverer.initialize(conf);
|
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
|
||||||
Assert.assertEquals(4,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
/* Start container 1, asks 3 containers */
|
/* Start container 1, asks 3 containers */
|
||||||
Container container = mockContainerWithGpuRequest(1, 3);
|
Container container = mockContainerWithGpuRequest(1,
|
||||||
|
createResourceRequest(3));
|
||||||
gpuResourceHandler.preStart(container);
|
gpuResourceHandler.preStart(container);
|
||||||
|
|
||||||
verify(mockNMStateStore).storeAssignedResources(container,
|
verify(mockNMStateStore).storeAssignedResources(container,
|
||||||
@ -375,16 +426,18 @@ public void testAllocationStored() throws Exception {
|
|||||||
new GpuDevice(2, 3)));
|
new GpuDevice(2, 3)));
|
||||||
|
|
||||||
// Only device=4 will be blocked.
|
// Only device=4 will be blocked.
|
||||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
|
verifyDeniedDevices(getContainerId(1),
|
||||||
|
Collections.singletonList(new GpuDevice(3, 4)));
|
||||||
|
|
||||||
/* Start container 2, ask 0 container, succeeded */
|
/* Start container 2, ask 0 container, succeeded */
|
||||||
container = mockContainerWithGpuRequest(2, 0);
|
container = mockContainerWithGpuRequest(2, createResourceRequest(0));
|
||||||
gpuResourceHandler.preStart(container);
|
gpuResourceHandler.preStart(container);
|
||||||
|
|
||||||
verifyDeniedDevices(getContainerId(2), Arrays
|
verifyDeniedDevices(getContainerId(2), Arrays
|
||||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
|
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
|
||||||
new GpuDevice(3, 4)));
|
new GpuDevice(3, 4)));
|
||||||
Assert.assertEquals(0, container.getResourceMappings()
|
assertEquals("Number of GPU device allocations is not the expected!", 0,
|
||||||
|
container.getResourceMappings()
|
||||||
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
||||||
|
|
||||||
// Store assigned resource will not be invoked.
|
// Store assigned resource will not be invoked.
|
||||||
@ -394,7 +447,8 @@ public void testAllocationStored() throws Exception {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAllocationStoredWithNULLStateStore() throws Exception {
|
public void testAllocationStoredWithNULLStateStore() throws Exception {
|
||||||
NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
|
NMNullStateStoreService mockNMNULLStateStore =
|
||||||
|
mock(NMNullStateStoreService.class);
|
||||||
|
|
||||||
Configuration conf = createDefaultConfig();
|
Configuration conf = createDefaultConfig();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
@ -410,11 +464,11 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
|||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf);
|
||||||
|
|
||||||
gpuNULLStateResourceHandler.bootstrap(conf);
|
gpuNULLStateResourceHandler.bootstrap(conf);
|
||||||
Assert.assertEquals(4,
|
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
|
||||||
gpuNULLStateResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
/* Start container 1, asks 3 containers */
|
/* Start container 1, asks 3 containers */
|
||||||
Container container = mockContainerWithGpuRequest(1, 3);
|
Container container = mockContainerWithGpuRequest(1,
|
||||||
|
createResourceRequest(3));
|
||||||
gpuNULLStateResourceHandler.preStart(container);
|
gpuNULLStateResourceHandler.preStart(container);
|
||||||
|
|
||||||
verify(nmnctx.getNMStateStore()).storeAssignedResources(container,
|
verify(nmnctx.getNMStateStore()).storeAssignedResources(container,
|
||||||
@ -425,13 +479,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRecoverResourceAllocation() throws Exception {
|
public void testRecoverResourceAllocation() throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
initializeGpus();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
|
||||||
gpuDiscoverer.initialize(conf);
|
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
|
||||||
Assert.assertEquals(4,
|
|
||||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
|
||||||
|
|
||||||
Container nmContainer = mock(Container.class);
|
Container nmContainer = mock(Container.class);
|
||||||
ResourceMappings rmap = new ResourceMappings();
|
ResourceMappings rmap = new ResourceMappings();
|
||||||
@ -450,12 +498,14 @@ public void testRecoverResourceAllocation() throws Exception {
|
|||||||
|
|
||||||
Map<GpuDevice, ContainerId> deviceAllocationMapping =
|
Map<GpuDevice, ContainerId> deviceAllocationMapping =
|
||||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
||||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
assertEquals("Unexpected number of allocated GPU devices!", 2,
|
||||||
Assert.assertTrue(
|
deviceAllocationMapping.size());
|
||||||
|
assertTrue("Expected GPU device is not found in allocations!",
|
||||||
deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
|
deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
|
||||||
Assert.assertTrue(
|
assertTrue("Expected GPU device is not found in allocations!",
|
||||||
deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
|
deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
|
||||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
assertEquals("GPU device is not assigned to the expected container!",
|
||||||
|
deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||||
getContainerId(1));
|
getContainerId(1));
|
||||||
|
|
||||||
// TEST CASE
|
// TEST CASE
|
||||||
@ -477,17 +527,20 @@ public void testRecoverResourceAllocation() throws Exception {
|
|||||||
} catch (ResourceHandlerException e) {
|
} catch (ResourceHandlerException e) {
|
||||||
caughtException = true;
|
caughtException = true;
|
||||||
}
|
}
|
||||||
Assert.assertTrue(
|
assertTrue(
|
||||||
"Should fail since requested device Id is not in allowed list",
|
"Should fail since requested device Id is not in allowed list",
|
||||||
caughtException);
|
caughtException);
|
||||||
|
|
||||||
// Make sure internal state not changed.
|
// Make sure internal state not changed.
|
||||||
deviceAllocationMapping =
|
deviceAllocationMapping =
|
||||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
||||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
assertEquals("Unexpected number of allocated GPU devices!",
|
||||||
Assert.assertTrue(deviceAllocationMapping.keySet()
|
2, deviceAllocationMapping.size());
|
||||||
|
assertTrue("Expected GPU devices are not found in allocations!",
|
||||||
|
deviceAllocationMapping.keySet()
|
||||||
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
||||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
assertEquals("GPU device is not assigned to the expected container!",
|
||||||
|
deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||||
getContainerId(1));
|
getContainerId(1));
|
||||||
|
|
||||||
// TEST CASE
|
// TEST CASE
|
||||||
@ -509,17 +562,20 @@ public void testRecoverResourceAllocation() throws Exception {
|
|||||||
} catch (ResourceHandlerException e) {
|
} catch (ResourceHandlerException e) {
|
||||||
caughtException = true;
|
caughtException = true;
|
||||||
}
|
}
|
||||||
Assert.assertTrue(
|
assertTrue(
|
||||||
"Should fail since requested device Id is already assigned",
|
"Should fail since requested device Id is already assigned",
|
||||||
caughtException);
|
caughtException);
|
||||||
|
|
||||||
// Make sure internal state not changed.
|
// Make sure internal state not changed.
|
||||||
deviceAllocationMapping =
|
deviceAllocationMapping =
|
||||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
|
||||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
assertEquals("Unexpected number of allocated GPU devices!",
|
||||||
Assert.assertTrue(deviceAllocationMapping.keySet()
|
2, deviceAllocationMapping.size());
|
||||||
|
assertTrue("Expected GPU devices are not found in allocations!",
|
||||||
|
deviceAllocationMapping.keySet()
|
||||||
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
||||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
assertEquals("GPU device is not assigned to the expected container!",
|
||||||
|
deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||||
getContainerId(1));
|
getContainerId(1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user