diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 1b44f4291c..fa75eb4025 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1715,6 +1715,15 @@ public static boolean isAclEnabled(Configuration conf) { public static final String DEFAULT_NM_FPGA_VENDOR_PLUGIN = "org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.IntelFpgaOpenclPlugin"; + @Private + public static final String NM_FPGA_DEVICE_DISCOVERY_SCRIPT = + NM_FPGA_RESOURCE_PREFIX + "device-discovery-script"; + + @Private + public static final String NM_FPGA_AVAILABLE_DEVICES = + NM_FPGA_RESOURCE_PREFIX + "available-devices"; + + public static final String NM_NETWORK_TAG_PREFIX = NM_PREFIX + "network-tagging"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 1a5c35a84b..99105c8aeb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3960,6 +3960,33 @@ auto + + + Absolute path to a script or executable that returns the available FPGA cards. + The returned string must be a single line and follow the format: + "deviceA/N:M,deviceB/X:Y". Example: "acl0/243:0,acl1/243:1". The numbers after + the "/" character are the device major and minor numbers. + + When the script is enabled, auto-discovery is disabled and the "aocl" command is not + invoked to verify the available cards.
+ + yarn.nodemanager.resource-plugins.fpga.device-discovery-script + + + + + + List of FPGA available devices in the given node. + The value must follow the format: "deviceA/N:M,deviceB/X:Y". + Example: "acl0/243:0,acl1/243:1". The numbers after + the "/" character are the device major and minor numbers. + + When this property is used, both auto-discovery and external script are ignored. + + yarn.nodemanager.resource-plugins.fpga.available-devices + + + The http address of the timeline reader web application. yarn.timeline-service.reader.webapp.address diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaDiscoverer.java index 8d32a18bf7..32b88b2dbc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaDiscoverer.java @@ -19,21 +19,34 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga; -import com.google.common.annotations.VisibleForTesting; +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.yarn.conf.YarnConfiguration; import 
org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.AoclOutputBasedDiscoveryStrategy; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.FPGADiscoveryStrategy; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.ScriptBasedFPGADiscoveryStrategy; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.SettingsBasedFPGADiscoveryStrategy; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Iterator; -import java.util.List; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; public class FpgaDiscoverer { - - public static final Logger LOG = LoggerFactory.getLogger( + private static final Logger LOG = LoggerFactory.getLogger( FpgaDiscoverer.class); private static FpgaDiscoverer instance; @@ -44,8 +57,10 @@ public class FpgaDiscoverer { private List currentFpgaInfo = null; + private Function> scriptRunner = this::runScript; + // shell command timeout - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + public static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; static { instance = new FpgaDiscoverer(); @@ -56,31 +71,41 @@ public static FpgaDiscoverer getInstance() { } @VisibleForTesting - public synchronized static FpgaDiscoverer setInstance(FpgaDiscoverer newInstance) { + void setScriptRunner(Function> scriptRunner) { + this.scriptRunner = scriptRunner; + } + + @VisibleForTesting + static void reset() { + instance = new FpgaDiscoverer(); + } + + @VisibleForTesting + public static FpgaDiscoverer 
setInstance(FpgaDiscoverer newInstance) { instance = newInstance; return instance; } @VisibleForTesting - public synchronized void setConf(Configuration conf) { - this.conf = conf; + public void setConf(Configuration configuration) { + this.conf = configuration; } public List getCurrentFpgaInfo() { return currentFpgaInfo; } - public synchronized void setResourceHanderPlugin(AbstractFpgaVendorPlugin plugin) { - this.plugin = plugin; + public void setResourceHanderPlugin(AbstractFpgaVendorPlugin vendorPlugin) { + this.plugin = vendorPlugin; } - public synchronized boolean diagnose() { + public boolean diagnose() { return this.plugin.diagnose(MAX_EXEC_TIMEOUT_MS); } - public synchronized void initialize(Configuration conf) throws YarnException { - this.conf = conf; - this.plugin.initPlugin(conf); + public void initialize(Configuration config) throws YarnException { + this.conf = config; + this.plugin.initPlugin(config); // Try to diagnose FPGA LOG.info("Trying to diagnose FPGA information ..."); if (!diagnose()) { @@ -91,40 +116,45 @@ public synchronized void initialize(Configuration conf) throws YarnException { /** * get avialable devices minor numbers from toolchain or static configuration * */ - public synchronized List discover() throws ResourceHandlerException { + public List discover() + throws ResourceHandlerException { List list; String allowed = this.conf.get(YarnConfiguration.NM_FPGA_ALLOWED_DEVICES); - // whatever static or auto discover, we always needs - // the vendor plugin to discover. 
For instance, IntelFpgaOpenclPlugin need to - // setup a mapping of to - list = this.plugin.discover(MAX_EXEC_TIMEOUT_MS); - if (0 == list.size()) { - throw new ResourceHandlerException("No FPGA devices detected!"); + + String availableDevices = conf.get( + YarnConfiguration.NM_FPGA_AVAILABLE_DEVICES); + String discoveryScript = conf.get( + YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT); + + FPGADiscoveryStrategy discoveryStrategy; + if (availableDevices != null) { + discoveryStrategy = + new SettingsBasedFPGADiscoveryStrategy( + plugin.getFpgaType(), availableDevices); + } else if (discoveryScript != null) { + discoveryStrategy = + new ScriptBasedFPGADiscoveryStrategy( + plugin.getFpgaType(), scriptRunner, discoveryScript); + } else { + discoveryStrategy = new AoclOutputBasedDiscoveryStrategy(plugin); } - currentFpgaInfo = list; - if (allowed.equalsIgnoreCase( + + list = discoveryStrategy.discover(); + + if (allowed == null || allowed.equalsIgnoreCase( YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { - return list; + return list; } else if (allowed.matches("(\\d,)*\\d")){ - String[] minors = allowed.split(","); - Iterator iterator = list.iterator(); - // remove the non-configured minor numbers - FpgaResourceAllocator.FpgaDevice t; - while (iterator.hasNext()) { - boolean valid = false; - t = iterator.next(); - for (String minorNumber : minors) { - if (t.getMinor().toString().equals(minorNumber)) { - valid = true; - break; - } - } - if (!valid) { - iterator.remove(); - } - } + Set minors = Sets.newHashSet(allowed.split(",")); + + // Replace list with a filtered one + list = list + .stream() + .filter(dev -> minors.contains(dev.getMinor().toString())) + .collect(Collectors.toList()); + // if the count of user configured is still larger than actual - if (list.size() != minors.length) { + if (list.size() != minors.size()) { LOG.warn("We continue although there're mistakes in user's configuration " + YarnConfiguration.NM_FPGA_ALLOWED_DEVICES + "user 
configured:" + allowed + ", while the real:" + list.toString()); @@ -133,7 +163,41 @@ public synchronized List discover() throws Res throw new ResourceHandlerException("Invalid value configured for " + YarnConfiguration.NM_FPGA_ALLOWED_DEVICES + ":\"" + allowed + "\""); } + + currentFpgaInfo = ImmutableList.copyOf(list); + return list; } + private Optional runScript(String path) { + if (path == null || path.trim().isEmpty()) { + LOG.error("Undefined script"); + return Optional.empty(); + } + + File f = new File(path); + if (!f.exists()) { + LOG.error("Script does not exist"); + return Optional.empty(); + } + + if (!FileUtil.canExecute(f)) { + LOG.error("Script is not executable"); + return Optional.empty(); + } + + ShellCommandExecutor shell = new ShellCommandExecutor( + new String[] {path}, + null, + null, + MAX_EXEC_TIMEOUT_MS); + try { + shell.execute(); + String output = shell.getOutput(); + return Optional.of(output); + } catch (IOException e) { + LOG.error("Cannot execute script", e); + return Optional.empty(); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaNodeResourceUpdateHandler.java index 7511d8f0b3..fdba9a428b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaNodeResourceUpdateHandler.java @@ -20,6 +20,12 @@ package 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga; +import static org.apache.hadoop.yarn.api.records.ResourceInformation.FPGA_URI; + +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -30,13 +36,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import static org.apache.hadoop.yarn.api.records.ResourceInformation.FPGA_URI; - public class FpgaNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { private static final Logger LOG = LoggerFactory.getLogger( FpgaNodeResourceUpdateHandler.class); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/IntelFpgaOpenclPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/IntelFpgaOpenclPlugin.java index f8fb6d8a24..1f9cba78fc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/IntelFpgaOpenclPlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/IntelFpgaOpenclPlugin.java @@ -100,6 +100,7 @@ public boolean initPlugin(Configuration conf) { if (this.initialized) { return true; } + // Find the proper toolchain, mainly aocl String pluginDefaultBinaryName = getDefaultBinaryName(); String pathToExecutable = 
conf.get(YarnConfiguration.NM_FPGA_PATH_TO_EXEC, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/AoclOutputBasedDiscoveryStrategy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/AoclOutputBasedDiscoveryStrategy.java new file mode 100644 index 0000000000..bfa33ce9e5 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/AoclOutputBasedDiscoveryStrategy.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; + +import java.util.List; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.AbstractFpgaVendorPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaDiscoverer; + +/** + * FPGA device discovery strategy which invokes the "aocl" SDK command + * to retrieve the list of available FPGA cards. + */ +public class AoclOutputBasedDiscoveryStrategy + implements FPGADiscoveryStrategy { + + private final AbstractFpgaVendorPlugin plugin; + + /** + * @param fpgaPlugin vendor plugin whose discover() call is used + */ + public AoclOutputBasedDiscoveryStrategy(AbstractFpgaVendorPlugin fpgaPlugin) { + this.plugin = fpgaPlugin; + } + + /** + * Delegates discovery to the vendor plugin, passing + * FpgaDiscoverer.MAX_EXEC_TIMEOUT_MS as its argument. + * + * @return the devices reported by the plugin, never empty + * @throws ResourceHandlerException if the plugin reports no devices + */ + @Override + public List<FpgaDevice> discover() throws ResourceHandlerException { + List<FpgaDevice> list = + plugin.discover(FpgaDiscoverer.MAX_EXEC_TIMEOUT_MS); + if (list.isEmpty()) { + throw new ResourceHandlerException("No FPGA devices detected!"); + } + + return list; + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/DeviceSpecParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/DeviceSpecParser.java new file mode 100644 index 0000000000..44f4b6c32a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/DeviceSpecParser.java @@ -0,0 +1,87 @@ +/** + * Licensed to
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; + +/** + * Parses a string which specifies FPGA devices. Multiple devices should be + * separated by a comma. A device specification should contain the + * symbolic name of the device, major and minor device numbers. + * + * Example: "acl0/243:0,acl1/243:1". 
+ */ +public final class DeviceSpecParser { + // Group 1 is the device name, group 3 the major and group 5 the minor + // number. The name is matched with plain \w+: a bracket expression such + // as [0-31] is a character class (the digits 0-3), not a numeric range. + private static final String DEVICE_SPEC_REGEX = + "(\\w+)(\\/)(\\d+)(\\:)(\\d+)"; + + private static final Pattern DEVICE_PATTERN = + Pattern.compile(DEVICE_SPEC_REGEX); + + private DeviceSpecParser() { + // no instances + } + + /** + * Parses a comma-separated device specification string. + * + * @param type FPGA type assigned to every parsed device + * @param devices specification such as "acl0/243:0,acl1/243:1" + * @return the parsed devices, or an empty list if the input is blank + * @throws ResourceHandlerException if an entry does not match the + * expected format or a major/minor number does not fit in an int + */ + static List<FpgaDevice> getDevicesFromString(String type, String devices) + throws ResourceHandlerException { + if (devices.trim().isEmpty()) { + return Collections.emptyList(); + } + + String[] deviceList = devices.split(","); + + List<FpgaDevice> fpgaDevices = new ArrayList<>(); + + for (final String deviceSpec : deviceList) { + // Tolerate blanks around an entry, e.g. a trailing line break + // at the end of a discovery script's output. + Matcher matcher = DEVICE_PATTERN.matcher(deviceSpec.trim()); + if (matcher.matches()) { + try { + String devName = matcher.group(1); + int major = Integer.parseInt(matcher.group(3)); + int minor = Integer.parseInt(matcher.group(5)); + fpgaDevices.add(new FpgaDevice(type, + major, + minor, + null, + null, + devName, + null, + null, + null)); + } catch (NumberFormatException e) { + throw new ResourceHandlerException( + "Cannot parse major/minor number: " + deviceSpec, e); + } + } else { + throw new ResourceHandlerException( + "Illegal device specification string: " + deviceSpec); + } + } + + return fpgaDevices; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/FPGADiscoveryStrategy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/FPGADiscoveryStrategy.java new file mode 100644 index 0000000000..a707ac672e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/FPGADiscoveryStrategy.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF)
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; + +import java.util.List; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; + +/** + * Interface for an FPGA device discovery strategy. 
+ */ +public interface FPGADiscoveryStrategy { + /** + * Runs the discovery and returns the FPGA devices that were found. + * + * @return the discovered devices + * @throws ResourceHandlerException if the devices cannot be determined + */ + List<FpgaDevice> discover() throws ResourceHandlerException; +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/ScriptBasedFPGADiscoveryStrategy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/ScriptBasedFPGADiscoveryStrategy.java new file mode 100644 index 0000000000..bda4539882 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/ScriptBasedFPGADiscoveryStrategy.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; + +import java.util.List; +import java.util.Optional; +import java.util.function.Function; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; + +/** + * FPGA device discovery strategy which invokes an external script. + * The script must return a single line in given format. + * + * See DeviceSpecParser for details. + */ +public class ScriptBasedFPGADiscoveryStrategy + implements FPGADiscoveryStrategy { + + private final Function<String, Optional<String>> scriptRunner; + private final String discoveryScript; + private final String type; + + /** + * @param fpgaType FPGA type assigned to every discovered device + * @param scriptRunner function that is applied to the script value and + * yields the script output, or an empty Optional on failure + * @param propValue value handed to the script runner + */ + public ScriptBasedFPGADiscoveryStrategy( + String fpgaType, + Function<String, Optional<String>> scriptRunner, + String propValue) { + this.scriptRunner = scriptRunner; + this.discoveryScript = propValue; + this.type = fpgaType; + } + + /** + * Applies the script runner and parses its output with DeviceSpecParser. + * + * @return the devices listed in the script output, never empty + * @throws ResourceHandlerException if the runner yields no output, the + * output cannot be parsed or it lists no devices + */ + @Override + public List<FpgaDevice> discover() throws ResourceHandlerException { + Optional<String> scriptOutput = + scriptRunner.apply(discoveryScript); + if (scriptOutput.isPresent()) { + List<FpgaDevice> list = + DeviceSpecParser.getDevicesFromString(type, scriptOutput.get()); + if (list.isEmpty()) { + throw new ResourceHandlerException("No FPGA devices were specified"); + } + return list; + } else { + throw new ResourceHandlerException("Unable to run external script"); + } + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/SettingsBasedFPGADiscoveryStrategy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/SettingsBasedFPGADiscoveryStrategy.java new file mode 100644
index 0000000000..eb7bea8421 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/SettingsBasedFPGADiscoveryStrategy.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; + +import java.util.List; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; + +/** + * FPGA device discovery strategy which parses a string. + * The string must consist of a single line and be in a specific format. + * + * See DeviceSpecParser for details. 
+ */ +public class SettingsBasedFPGADiscoveryStrategy + implements FPGADiscoveryStrategy { + + private final String type; + private final String availableDevices; + + /** + * @param fpgaType FPGA type assigned to every parsed device + * @param devices device specification string to parse + */ + public SettingsBasedFPGADiscoveryStrategy( + String fpgaType, String devices) { + this.type = fpgaType; + this.availableDevices = devices; + } + + /** + * Parses the configured device string with DeviceSpecParser. + * + * @return the devices named in the string, never empty + * @throws ResourceHandlerException if the string cannot be parsed or + * names no devices + */ + @Override + public List<FpgaDevice> discover() throws ResourceHandlerException { + List<FpgaDevice> list = + DeviceSpecParser.getDevicesFromString(type, availableDevices); + if (list.isEmpty()) { + throw new ResourceHandlerException("No FPGA devices were specified"); + } + return list; + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/package-info.java new file mode 100644 index 0000000000..8a0471ccf2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/discovery/package-info.java @@ -0,0 +1,19 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/TestFpgaDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/TestFpgaDiscoverer.java index dd99845895..f2b92d2457 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/TestFpgaDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/TestFpgaDiscoverer.java @@ -24,23 +24,34 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator; -import org.junit.Assert; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; 
+import org.junit.rules.ExpectedException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.lang.reflect.Field; -import java.util.*; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public class TestFpgaDiscoverer { + @Rule + public ExpectedException expected = ExpectedException.none(); private String getTestParentFolder() { File f = new File("target/temp/" + TestFpgaDiscoverer.class.getName()); @@ -57,6 +68,7 @@ public void before() throws IOException { File f = new File(folder); FileUtils.deleteDirectory(f); f.mkdirs(); + FpgaDiscoverer.reset(); } // A dirty hack to modify the env of the current JVM itself - Dirty, but @@ -106,7 +118,7 @@ public void testLinuxFpgaResourceDiscoverPluginConfig() throws Exception { discoverer.initialize(conf); // Case 1. No configuration set for binary(no environment "ALTERAOCLSDKROOT" set) - Assert.assertEquals("No configuration(no environment ALTERAOCLSDKROOT set)" + + assertEquals("No configuration(no environment ALTERAOCLSDKROOT set)" + "should return just a single binary name", "aocl", openclPlugin.getPathToExecutable()); @@ -115,19 +127,19 @@ public void testLinuxFpgaResourceDiscoverPluginConfig() throws Exception { conf.set(YarnConfiguration.NM_FPGA_PATH_TO_EXEC, getTestParentFolder() + "/aocl"); touchFile(fakeBinary); discoverer.initialize(conf); - Assert.assertEquals("Correct configuration should return user setting", + assertEquals("Correct configuration should return user setting", getTestParentFolder() + "/aocl", openclPlugin.getPathToExecutable()); // Case 3. With correct configuration but file doesn't exists. 
Use default fakeBinary.delete(); discoverer.initialize(conf); - Assert.assertEquals("Correct configuration but file doesn't exists should return just a single binary name", + assertEquals("Should return just a single binary name", "aocl", openclPlugin.getPathToExecutable()); // Case 4. Set a empty value conf.set(YarnConfiguration.NM_FPGA_PATH_TO_EXEC, ""); discoverer.initialize(conf); - Assert.assertEquals("configuration with empty string value, should use aocl", + assertEquals("configuration with empty string value, should use aocl", "aocl", openclPlugin.getPathToExecutable()); // Case 5. No configuration set for binary, but set environment "ALTERAOCLSDKROOT" @@ -140,7 +152,7 @@ public void testLinuxFpgaResourceDiscoverPluginConfig() throws Exception { newEnv.put("ALTERAOCLSDKROOT", getTestParentFolder()); setNewEnvironmentHack(newEnv); discoverer.initialize(conf); - Assert.assertEquals("No configuration but with environment ALTERAOCLSDKROOT set", + assertEquals("No configuration but with environment ALTERAOCLSDKROOT set", getTestParentFolder() + "/bin/aocl", openclPlugin.getPathToExecutable()); } @@ -193,39 +205,229 @@ public void testDiscoverPluginParser() throws YarnException { // Case 1. 
core parsing openclPlugin.parseDiagnoseInfo(output, list); - Assert.assertEquals(3, list.size()); - Assert.assertEquals("IntelOpenCL", list.get(0).getType()); - Assert.assertEquals("247", list.get(0).getMajor().toString()); - Assert.assertEquals("0", list.get(0).getMinor().toString()); - Assert.assertEquals("acl0", list.get(0).getAliasDevName()); - Assert.assertEquals("aclnalla_pcie0", list.get(0).getDevName()); - Assert.assertEquals("02:00.00", list.get(0).getBusNum()); - Assert.assertEquals("53.1 degrees C", list.get(0).getTemperature()); - Assert.assertEquals("31.7 Watts", list.get(0).getCardPowerUsage()); + assertEquals(3, list.size()); + assertEquals("IntelOpenCL", list.get(0).getType()); + assertEquals("247", list.get(0).getMajor().toString()); + assertEquals("0", list.get(0).getMinor().toString()); + assertEquals("acl0", list.get(0).getAliasDevName()); + assertEquals("aclnalla_pcie0", list.get(0).getDevName()); + assertEquals("02:00.00", list.get(0).getBusNum()); + assertEquals("53.1 degrees C", list.get(0).getTemperature()); + assertEquals("31.7 Watts", list.get(0).getCardPowerUsage()); - Assert.assertEquals("IntelOpenCL", list.get(1).getType()); - Assert.assertEquals("247", list.get(1).getMajor().toString()); - Assert.assertEquals("1", list.get(1).getMinor().toString()); - Assert.assertEquals("acl1", list.get(1).getAliasDevName()); - Assert.assertEquals("aclnalla_pcie1", list.get(1).getDevName()); - Assert.assertEquals("03:00.00", list.get(1).getBusNum()); - Assert.assertEquals("43.1 degrees C", list.get(1).getTemperature()); - Assert.assertEquals("11.7 Watts", list.get(1).getCardPowerUsage()); + assertEquals("IntelOpenCL", list.get(1).getType()); + assertEquals("247", list.get(1).getMajor().toString()); + assertEquals("1", list.get(1).getMinor().toString()); + assertEquals("acl1", list.get(1).getAliasDevName()); + assertEquals("aclnalla_pcie1", list.get(1).getDevName()); + assertEquals("03:00.00", list.get(1).getBusNum()); + assertEquals("43.1 degrees C", 
list.get(1).getTemperature()); + assertEquals("11.7 Watts", list.get(1).getCardPowerUsage()); - Assert.assertEquals("IntelOpenCL", list.get(2).getType()); - Assert.assertEquals("246", list.get(2).getMajor().toString()); - Assert.assertEquals("0", list.get(2).getMinor().toString()); - Assert.assertEquals("acl2", list.get(2).getAliasDevName()); - Assert.assertEquals("acla10_ref0", list.get(2).getDevName()); - Assert.assertEquals("09:00.00", list.get(2).getBusNum()); - Assert.assertEquals("50.5781 degrees C", list.get(2).getTemperature()); - Assert.assertEquals("", list.get(2).getCardPowerUsage()); + assertEquals("IntelOpenCL", list.get(2).getType()); + assertEquals("246", list.get(2).getMajor().toString()); + assertEquals("0", list.get(2).getMinor().toString()); + assertEquals("acl2", list.get(2).getAliasDevName()); + assertEquals("acla10_ref0", list.get(2).getDevName()); + assertEquals("09:00.00", list.get(2).getBusNum()); + assertEquals("50.5781 degrees C", list.get(2).getTemperature()); + assertEquals("", list.get(2).getCardPowerUsage()); // Case 2. 
check alias map Map aliasMap = openclPlugin.getAliasMap(); - Assert.assertEquals("acl0", aliasMap.get("247:0")); - Assert.assertEquals("acl1", aliasMap.get("247:1")); - Assert.assertEquals("acl2", aliasMap.get("246:0")); + assertEquals("acl0", aliasMap.get("247:0")); + assertEquals("acl1", aliasMap.get("247:1")); + assertEquals("acl2", aliasMap.get("246:0")); + }
+
+  // The configured device list must be parsed into matching FpgaDevice entries.
+  @Test
+  public void testDiscoveryWhenAvailableDevicesDefined() throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_AVAILABLE_DEVICES,
+        "acl0/243:0,acl1/244:1");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+
+    discoverer.initialize(conf);
+    List<FpgaDevice> devices = discoverer.discover();
+    assertEquals("Number of devices", 2, devices.size());
+    FpgaDevice device0 = devices.get(0);
+    FpgaDevice device1 = devices.get(1);
+
+    assertEquals("Device id", "acl0", device0.getAliasDevName());
+    assertEquals("Minor number", Integer.valueOf(0), device0.getMinor());
+    assertEquals("Major", Integer.valueOf(243), device0.getMajor());
+
+    assertEquals("Device id", "acl1", device1.getAliasDevName());
+    assertEquals("Minor number", Integer.valueOf(1), device1.getMinor());
+    assertEquals("Major", Integer.valueOf(244), device1.getMajor());
+  }
+
+  // An empty NM_FPGA_AVAILABLE_DEVICES value must make discovery fail with
+  // "No FPGA devices were specified".
+  @Test
+  public void testDiscoveryWhenAvailableDevicesEmpty() throws YarnException {
+    expected.expect(ResourceHandlerException.class);
+    expected.expectMessage("No FPGA devices were specified");
+
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_AVAILABLE_DEVICES, "");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+
+    discoverer.initialize(conf);
+    discoverer.discover();
+  }
+
+  @Test
+  public void testDiscoveryWhenAvailableDevicesAreIllegalString()
+      throws YarnException {
+    expected.expect(ResourceHandlerException.class);
+    expected.expectMessage("Illegal device specification string");
+    // "acl1/244=1" breaks the documented "device/major:minor" entry format
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_AVAILABLE_DEVICES,
+        "illegal/243:0,acl1/244=1");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+
+    discoverer.initialize(conf);
+    discoverer.discover();
+  }
+
+  // Output of the discovery script (stubbed via setScriptRunner) must be
+  // parsed into matching FpgaDevice entries.
+  @Test
+  public void testDiscoveryWhenExternalScriptDefined() throws YarnException {
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT,
+        "/dummy/script");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+    discoverer.setScriptRunner(s -> Optional.of("acl0/243:0,acl1/244:1"));
+
+    discoverer.initialize(conf);
+    List<FpgaDevice> devices = discoverer.discover();
+    assertEquals("Number of devices", 2, devices.size());
+    FpgaDevice device0 = devices.get(0);
+    FpgaDevice device1 = devices.get(1);
+
+    assertEquals("Device id", "acl0", device0.getAliasDevName());
+    assertEquals("Minor number", Integer.valueOf(0), device0.getMinor());
+    assertEquals("Major", Integer.valueOf(243), device0.getMajor());
+
+    assertEquals("Device id", "acl1", device1.getAliasDevName());
+    assertEquals("Minor number", Integer.valueOf(1), device1.getMinor());
+    assertEquals("Major", Integer.valueOf(244), device1.getMajor());
+  }
+
+  // A script that prints nothing yields no devices, which must be an error.
+  @Test
+  public void testDiscoveryWhenExternalScriptReturnsEmptyString()
+      throws YarnException {
+    expected.expect(ResourceHandlerException.class);
+    expected.expectMessage("No FPGA devices were specified");
+
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT,
+        "/dummy/script");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+    discoverer.setScriptRunner(s -> Optional.of(""));
+
+    discoverer.initialize(conf);
+    discoverer.discover();
+  }
+
+  // An empty Optional from the script runner stands in for a script that
+  // could not be run; discovery must then fail with
+  // "Unable to run external script".
+  @Test
+  public void testDiscoveryWhenExternalScriptFails() throws YarnException {
+    expected.expect(ResourceHandlerException.class);
+    expected.expectMessage("Unable to run external script");
+
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT,
+        "/dummy/script");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+    discoverer.setScriptRunner(s -> Optional.empty());
+
+    discoverer.initialize(conf);
+    discoverer.discover();
+  }
+
+  // An empty script path cannot be executed, so discovery must fail.
+  @Test
+  public void testDiscoveryWhenExternalScriptUndefined() throws YarnException {
+    expected.expect(ResourceHandlerException.class);
+    expected.expectMessage("Unable to run external script");
+
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT, "");
+    FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+    IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+    discoverer.setResourceHanderPlugin(openclPlugin);
+    openclPlugin.initPlugin(conf);
+    openclPlugin.setShell(mockPuginShell());
+
+    discoverer.initialize(conf);
+    discoverer.discover();
+  }
+
+  // A script file lacking the executable bit must be reported as not runnable.
+  @Test
+  public void testDiscoveryWhenExternalScriptCannotBeExecuted()
+      throws YarnException, IOException {
+    File fakeScript = new File(getTestParentFolder() + "/fakeScript");
+    try {
+      expected.expect(ResourceHandlerException.class);
+      expected.expectMessage("Unable to run external script");
+
+      Configuration conf = new Configuration(false);
+      touchFile(fakeScript);
+      fakeScript.setExecutable(false);
+      conf.set(YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT,
+          fakeScript.getAbsolutePath());
+      FpgaDiscoverer discoverer = FpgaDiscoverer.getInstance();
+
+      IntelFpgaOpenclPlugin openclPlugin = new IntelFpgaOpenclPlugin();
+      discoverer.setResourceHanderPlugin(openclPlugin);
+      openclPlugin.initPlugin(conf);
+      openclPlugin.setShell(mockPuginShell());
+
+      discoverer.initialize(conf);
+      discoverer.discover();
+    } finally {
+      fakeScript.delete();
+    }
   }

   private IntelFpgaOpenclPlugin.InnerShellExecutor mockPuginShell() {