From e8fa192f07b6f2e7a0b03813edca03c505a8ac1b Mon Sep 17 00:00:00 2001 From: Szilard Nemeth Date: Wed, 21 Aug 2019 16:44:22 +0200 Subject: [PATCH] YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko --- .../hadoop/yarn/conf/YarnConfiguration.java | 14 ++++ .../src/main/resources/yarn-default.xml | 11 +++ .../resources/ResourcesExceptionUtil.java | 42 ++++++++++ .../resources/gpu/GpuResourceHandlerImpl.java | 5 +- .../resourceplugin/ResourcePluginManager.java | 6 +- .../resourceplugin/gpu/GpuDiscoverer.java | 83 ++++++++++--------- .../gpu/GpuNodeResourceUpdateHandler.java | 13 ++- .../resourceplugin/gpu/GpuResourcePlugin.java | 35 ++++++-- .../gpu/NvidiaBinaryHelper.java | 63 ++++++++++++++ .../gpu/TestGpuResourceHandlerImpl.java | 16 ++-- .../resourceplugin/gpu/TestGpuDiscoverer.java | 45 +++++----- 11 files changed, 256 insertions(+), 77 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 134b69871c..1e55fe383b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1647,6 +1647,20 @@ public class YarnConfiguration extends Configuration { public static final String 
NM_RESOURCE_PLUGINS = NM_PREFIX + "resource-plugins"; + + /** + * Specifies whether the initialization of the Node Manager should fail fast + * if a certain device (GPU, FPGA, etc) was not found in the system. If set + * to "true", then an exception will be thrown if a device is missing or + * an error occurred during discovery; otherwise initialization continues. + */ + @Private + public static final String NM_RESOURCE_PLUGINS_FAIL_FAST = + NM_RESOURCE_PLUGINS + ".fail-fast"; + + @Private + public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true; + /** * This setting controls if pluggable device plugin framework is enabled. * */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 4b93d1e18a..7a672dee6c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3918,6 +3918,17 @@ + + + Specifies whether the initialization of the Node Manager should fail fast + if a certain device (GPU, FPGA, etc) was not found in the system. If set + to "true", then an exception will be thrown if a device is missing or + an error occurred during discovery; otherwise initialization continues. 
+ + yarn.nodemanager.resource-plugins.fail-fast + + + Specify GPU devices which can be managed by YARN NodeManager, split by comma diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java new file mode 100644 index 0000000000..f270f42440 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST; +import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.exceptions.YarnException; + +/** + * Small utility class which only re-throws YarnException if + * NM_RESOURCE_PLUGINS_FAIL_FAST property is true. + * + */ +public final class ResourcesExceptionUtil { + private ResourcesExceptionUtil() {} + + public static void throwIfNecessary(YarnException e, Configuration conf) + throws YarnException { + if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST, + DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) { + throw e; + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index 71b041b618..c3545fcab9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
import org.apache.hadoop.conf.Configuration; @@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler { String message = "GPU is enabled on the NodeManager, but couldn't find " + "any usable GPU devices, please double check configuration!"; LOG.error(message); - throw new ResourceHandlerException(message); + throwIfNecessary(new ResourceHandlerException(message), + configuration); } } catch (YarnException e) { LOG.error("Exception when trying to get usable GPU device", e); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index 84cdd7a475..0dfa33faac 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -72,7 +72,7 @@ public class ResourcePluginManager { Map pluginMap = Maps.newHashMap(); if (plugins != null) { - pluginMap = initializePlugins(context, plugins); + pluginMap = initializePlugins(conf, context, plugins); } // Try to load pluggable device plugins @@ -101,7 +101,7 @@ public class ResourcePluginManager { return plugins; } - private Map initializePlugins( + private Map initializePlugins(Configuration conf, Context context, String[] plugins) throws YarnException { Map pluginMap = Maps.newHashMap(); @@ -114,7 +114,7 @@ public class ResourcePluginManager { if (resourceName.equals(GPU_URI)) { final GpuDiscoverer gpuDiscoverer = 
new GpuDiscoverer(); final GpuNodeResourceUpdateHandler updateHandler = - new GpuNodeResourceUpdateHandler(gpuDiscoverer); + new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf); plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer); } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 939093f403..3f2b65769f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; + import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; @@ -26,7 +28,6 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; @@ 
-58,10 +59,9 @@ public class GpuDiscoverer extends Configured { private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - // command should not run more than 10 sec. - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_REPEATED_ERROR_ALLOWED = 10; + private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; private Map environment = new HashMap<>(); @@ -110,24 +110,17 @@ public class GpuDiscoverer extends Configured { * @return GpuDeviceInformation * @throws YarnException when any error happens */ - synchronized GpuDeviceInformation getGpuDeviceInformation() + public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { - validateConfOrThrowException(); - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); } - String output; try { - output = Shell.execCommand(environment, - new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); - GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); - lastDiscoveredGpuInformation = parser.parseXml(output); - numOfErrorExecutionSinceLastSucceed = 0; - return lastDiscoveredGpuInformation; + lastDiscoveredGpuInformation = + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getErrorMessageOfScriptExecution(e.getMessage()); @@ -136,14 +129,14 @@ public class GpuDiscoverer extends Configured { } catch (YarnException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getFailedToParseErrorMessage(e.getMessage()); - if (LOG.isDebugEnabled()) { - LOG.warn(msg, e); - } + LOG.debug(msg, e); throw e; } + + return lastDiscoveredGpuInformation; } - private boolean IsAutoDiscoveryEnabled() { + private boolean isAutoDiscoveryEnabled() { String 
allowedDevicesStr = getConf().get( YarnConfiguration.NM_GPU_ALLOWED_DEVICES, YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); @@ -155,13 +148,12 @@ public class GpuDiscoverer extends Configured { * Get list of GPU devices usable by YARN. * * @return List of GPU devices - * @throws YarnException when any issue happens */ public synchronized List getGpusUsableByYarn() throws YarnException { validateConfOrThrowException(); - if (IsAutoDiscoveryEnabled()) { + if (isAutoDiscoveryEnabled()) { return parseGpuDevicesFromAutoDiscoveredGpuInfo(); } else { if (gpuDevicesFromUser == null) { @@ -217,16 +209,27 @@ public class GpuDiscoverer extends Configured { if (device.trim().length() > 0) { String[] splitByColon = device.trim().split(":"); if (splitByColon.length != 2) { - throw GpuDeviceSpecificationException. - createWithWrongValueSpecified(device, devices); + throwIfNecessary(GpuDeviceSpecificationException + .createWithWrongValueSpecified(device, devices), getConf()); + LOG.warn("Wrong GPU specification string {}, ignored", device); + continue; + } + GpuDevice gpuDevice; + try { + gpuDevice = parseGpuDevice(splitByColon); + } catch (NumberFormatException e) { + throwIfNecessary(GpuDeviceSpecificationException + .createWithWrongValueSpecified(device, devices, e), getConf()); + LOG.warn("Cannot parse GPU device numbers: {}", device); + continue; } - GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices); if (!gpuDevices.contains(gpuDevice)) { gpuDevices.add(gpuDevice); } else { - throw GpuDeviceSpecificationException - .createWithDuplicateValueSpecified(device, devices); + throwIfNecessary(GpuDeviceSpecificationException + .createWithDuplicateValueSpecified(device, devices), getConf()); + LOG.warn("GPU device is duplicated: {}", device); } } } @@ -235,22 +238,17 @@ public class GpuDiscoverer extends Configured { return gpuDevices; } - private GpuDevice parseGpuDevice(String device, String[] splitByColon, - String allowedDevicesStr) throws YarnException { - try { - 
int index = Integer.parseInt(splitByColon[0]); - int minorNumber = Integer.parseInt(splitByColon[1]); - return new GpuDevice(index, minorNumber); - } catch (NumberFormatException e) { - throw GpuDeviceSpecificationException. - createWithWrongValueSpecified(device, allowedDevicesStr, e); - } + private GpuDevice parseGpuDevice(String[] splitByColon) { + int index = Integer.parseInt(splitByColon[0]); + int minorNumber = Integer.parseInt(splitByColon[1]); + return new GpuDevice(index, minorNumber); } - public synchronized void initialize(Configuration config) - throws YarnException { + public synchronized void initialize(Configuration config, + NvidiaBinaryHelper nvidiaHelper) throws YarnException { setConf(config); - if (IsAutoDiscoveryEnabled()) { + this.nvidiaBinaryHelper = nvidiaHelper; + if (isAutoDiscoveryEnabled()) { numOfErrorExecutionSinceLastSucceed = 0; lookUpAutoDiscoveryBinary(config); @@ -284,7 +282,18 @@ public class GpuDiscoverer extends Configured { binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); } else { binaryPath = configuredBinaryFile; + // If path exists but file name is incorrect don't execute the file + String fileName = binaryPath.getName(); + if (!DEFAULT_BINARY_NAME.equals(fileName)) { + String msg = String.format("Please check the configuration value of" + +" %s. 
It should point to an %s binary.", + YarnConfiguration.NM_GPU_PATH_TO_EXEC, + DEFAULT_BINARY_NAME); + throwIfNecessary(new YarnException(msg), config); + LOG.warn(msg); + } } + pathOfGpuBinary = binaryPath.getAbsolutePath(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 4b2258d557..afb0d7eda2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -18,6 +18,9 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { private static final Logger LOG = LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); private final GpuDiscoverer gpuDiscoverer; + private Configuration conf; - public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) { + public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer, + Configuration conf) { 
this.gpuDiscoverer = gpuDiscoverer; + this.conf = conf; } @Override @@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { "but could not find any usable GPUs on the NodeManager!"; LOG.error(message); // No gpu can be used by YARN. - throw new YarnException(message); + throwIfNecessary(new YarnException(message), conf); + return; } long nUsableGpus = usableGpus.size(); @@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { Map configuredResourceTypes = ResourceUtils.getResourceTypes(); if (!configuredResourceTypes.containsKey(GPU_URI)) { - throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " + LOG.warn("Found " + nUsableGpus + " usable GPUs, however " + GPU_URI + " resource-type is not configured inside" + " resource-types.xml, please configure it to enable GPU feature or" diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index 2b06f31f37..d44160e827 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import java.util.List; + import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import 
org.apache.hadoop.yarn.server.nodemanager.Context; @@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; - -import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin { private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; private final GpuDiscoverer gpuDiscoverer; + public static final int MAX_REPEATED_ERROR_ALLOWED = 10; + + private int numOfErrorExecutionSinceLastSucceed = 0; + private GpuResourceHandlerImpl gpuResourceHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; @@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin { @Override public void initialize(Context context) throws YarnException { - this.gpuDiscoverer.initialize(context.getConf()); + this.gpuDiscoverer.initialize(context.getConf(), + new NvidiaBinaryHelper()); this.dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); @@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin { @Override public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { - GpuDeviceInformation gpuDeviceInformation = - gpuDiscoverer.getGpuDeviceInformation(); + GpuDeviceInformation gpuDeviceInformation; //At this point the gpu plugin is already enabled checkGpuResourceHandler(); + checkErrorCount(); + try{ + gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation(); + numOfErrorExecutionSinceLastSucceed = 0; + } catch (YarnException e) { + LOG.error(e.getMessage(), e); + numOfErrorExecutionSinceLastSucceed++; + throw e; + } + GpuResourceAllocator gpuResourceAllocator = 
gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpus(); @@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin { } } + private void checkErrorCount() throws YarnException { + if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + String msg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + + " times, skip following executions."; + LOG.error(msg); + throw new YarnException(msg); + } + } + @Override public String toString() { return GpuResourcePlugin.class.getName(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java new file mode 100644 index 0000000000..8efc32a8b1 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import java.io.IOException; +import java.util.HashMap; + +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; + +/** + * Executes the "nvidia-smi" command and returns an object + * based on its output. + * + */ +public class NvidiaBinaryHelper { + /** + * command should not run more than 10 sec. 
+ */ + private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + + /** + * @param pathOfGpuBinary The path of the binary + * @return the GpuDeviceInformation parsed from the nvidia-smi output + * @throws IOException if the binary output is not readable + * @throws YarnException if the pathOfGpuBinary is null, + * or the output parse failed + */ + synchronized GpuDeviceInformation getGpuDeviceInformation( + String pathOfGpuBinary) throws IOException, YarnException { + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + + if (pathOfGpuBinary == null) { + throw new YarnException( + "Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); + } + + String output = Shell.execCommand(new HashMap<>(), + new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS); + return parser.parseXml(output); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java index 777a85ba24..7871179891 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; 
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; @@ -116,10 +117,12 @@ public class TestGpuResourceHandlerImpl { @Rule public ExpectedException expected = ExpectedException.none(); + private NvidiaBinaryHelper nvidiaBinaryHelper; + @Before public void setup() throws IOException { createTestDataDirectory(); - + nvidiaBinaryHelper = new NvidiaBinaryHelper(); CustomResourceTypesConfigurationProvider. initResourceTypes(ResourceInformation.GPU_URI); @@ -147,13 +150,14 @@ public class TestGpuResourceHandlerImpl { @After public void cleanupTestFiles() throws IOException { FileUtils.deleteDirectory(testDataDirectory); + nvidiaBinaryHelper = new NvidiaBinaryHelper(); } @Test public void testBootstrapWithRealGpuDiscoverer() throws Exception { Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); @@ -171,7 +175,7 @@ public class TestGpuResourceHandlerImpl { public void testBootstrapWithMockGpuDiscoverer() throws Exception { GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class); Configuration conf = new YarnConfiguration(); - mockDiscoverer.initialize(conf); + mockDiscoverer.initialize(conf, nvidiaBinaryHelper); expected.expect(ResourceHandlerException.class); gpuResourceHandler.bootstrap(conf); @@ -271,7 +275,7 @@ public class TestGpuResourceHandlerImpl { 
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer = new GpuDiscoverer(); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); Context nmContext = createMockNmContext(conf); gpuResourceHandler = new GpuResourceHandlerImpl(nmContext, mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); @@ -380,7 +384,7 @@ public class TestGpuResourceHandlerImpl { public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); try { gpuResourceHandler.bootstrap(conf); @@ -461,7 +465,7 @@ public class TestGpuResourceHandlerImpl { new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); gpuNULLStateResourceHandler.bootstrap(conf); verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index a70e668146..6da238581e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -64,6 +64,7 @@ public class TestGpuDiscoverer { private static final 
String BASH_SHEBANG = "#!/bin/bash\n\n"; private static final String TEST_PARENT_DIR = new File("target/temp/" + TestGpuDiscoverer.class.getName()).getAbsolutePath(); + private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper(); @Rule public ExpectedException exception = ExpectedException.none(); @@ -150,7 +151,7 @@ public class TestGpuDiscoverer { Configuration conf) throws YarnException { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); return discoverer; } @@ -163,14 +164,14 @@ public class TestGpuDiscoverer { // test case 1, check default setting. Configuration conf = new Configuration(false); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary()); assertNvidiaIsOnPath(discoverer); // test case 2, check mandatory set path. File fakeBinary = setupFakeBinary(conf); discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(fakeBinary.getAbsolutePath(), discoverer.getPathOfGpuBinary()); assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); @@ -179,7 +180,7 @@ public class TestGpuDiscoverer { // but binary doesn't exist so default path will be used. 
fakeBinary.delete(); discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary()); assertNvidiaIsOnPath(discoverer); @@ -317,7 +318,7 @@ public class TestGpuDiscoverer { Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest"))); Configuration conf = new Configuration(false); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); GpuDeviceInformation info = discoverer.getGpuDeviceInformation(); assertTrue(info.getGpus().size() > 0); @@ -331,7 +332,7 @@ public class TestGpuDiscoverer { Configuration conf = createConfigWithAllowedDevices("1:2"); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); List usableGpuDevices = discoverer.getGpusUsableByYarn(); assertEquals(1, usableGpuDevices.size()); @@ -346,7 +347,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -354,7 +355,7 @@ public class TestGpuDiscoverer { public void testGetNumberOfUsableGpusFromConfig() throws YarnException { Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4"); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); List usableGpuDevices = discoverer.getGpusUsableByYarn(); assertEquals(4, usableGpuDevices.size()); @@ -379,7 +380,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -390,7 +391,7 @@ public class TestGpuDiscoverer { 
exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -401,7 +402,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -412,7 +413,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -423,7 +424,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -434,7 +435,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -445,7 +446,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -456,7 +457,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -467,7 +468,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - 
discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -478,7 +479,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -488,7 +489,7 @@ public class TestGpuDiscoverer { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); GpuDiscoverer plugin = new GpuDiscoverer(); try { - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); plugin.getGpusUsableByYarn(); fail("Illegal format, should fail."); } catch (YarnException e) { @@ -501,15 +502,15 @@ public class TestGpuDiscoverer { } @Test - public void testScriptNotCalled() throws YarnException { + public void testScriptNotCalled() throws YarnException, IOException { Configuration conf = new Configuration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3"); GpuDiscoverer gpuSpy = spy(GpuDiscoverer.class); - gpuSpy.initialize(conf); + gpuSpy.initialize(conf, binaryHelper); gpuSpy.getGpusUsableByYarn(); verify(gpuSpy, never()).getGpuDeviceInformation(); } -} +} \ No newline at end of file