YARN-9235. If the Linux container executor is not set for a GPU cluster, GpuResourceHandlerImpl is not initialized and an NPE is thrown. Contributed by Antal Balint Steinbach, Adam Antal
This commit is contained in:
parent
190e4349d7
commit
c416284bb7
@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||||
@ -33,8 +34,14 @@
|
|||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
public class GpuResourcePlugin implements ResourcePlugin {
|
public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
|
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(GpuResourcePlugin.class);
|
||||||
|
|
||||||
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
||||||
private final GpuDiscoverer gpuDiscoverer;
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
||||||
@ -84,6 +91,10 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
|
|||||||
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||||
GpuDeviceInformation gpuDeviceInformation =
|
GpuDeviceInformation gpuDeviceInformation =
|
||||||
gpuDiscoverer.getGpuDeviceInformation();
|
gpuDiscoverer.getGpuDeviceInformation();
|
||||||
|
|
||||||
|
//At this point the gpu plugin is already enabled
|
||||||
|
checkGpuResourceHandler();
|
||||||
|
|
||||||
GpuResourceAllocator gpuResourceAllocator =
|
GpuResourceAllocator gpuResourceAllocator =
|
||||||
gpuResourceHandler.getGpuAllocator();
|
gpuResourceHandler.getGpuAllocator();
|
||||||
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
|
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
|
||||||
@ -94,6 +105,17 @@ public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
|||||||
assignedGpuDevices);
|
assignedGpuDevices);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void checkGpuResourceHandler() throws YarnException {
|
||||||
|
if(gpuResourceHandler == null) {
|
||||||
|
String errorMsg =
|
||||||
|
"Linux Container Executor is not configured for the NodeManager. "
|
||||||
|
+ "To fully enable GPU feature on the node also set "
|
||||||
|
+ YarnConfiguration.NM_CONTAINER_EXECUTOR + " properly.";
|
||||||
|
LOG.warn(errorMsg);
|
||||||
|
throw new YarnException(errorMsg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return GpuResourcePlugin.class.getName();
|
return GpuResourcePlugin.class.getName();
|
||||||
|
@ -0,0 +1,54 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestGpuResourcePlugin {
|
||||||
|
|
||||||
|
@Test(expected = YarnException.class)
|
||||||
|
public void testResourceHandlerNotInitialized() throws YarnException {
|
||||||
|
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
|
||||||
|
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
|
||||||
|
mock(GpuNodeResourceUpdateHandler.class);
|
||||||
|
|
||||||
|
GpuResourcePlugin target =
|
||||||
|
new GpuResourcePlugin(gpuNodeResourceUpdateHandler, gpuDiscoverer);
|
||||||
|
|
||||||
|
target.getNMResourceInfo();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testResourceHandlerIsInitialized() throws YarnException {
|
||||||
|
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
|
||||||
|
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
|
||||||
|
mock(GpuNodeResourceUpdateHandler.class);
|
||||||
|
|
||||||
|
GpuResourcePlugin target =
|
||||||
|
new GpuResourcePlugin(gpuNodeResourceUpdateHandler, gpuDiscoverer);
|
||||||
|
|
||||||
|
target.createResourceHandler(null, null, null);
|
||||||
|
|
||||||
|
//Not throwing any exception
|
||||||
|
target.getNMResourceInfo();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user