From 729ccb2cabde67ee68600438b334a4f65d947092 Mon Sep 17 00:00:00 2001 From: Zhankun Tang Date: Sun, 19 May 2019 21:18:33 +0800 Subject: [PATCH] SUBMARINE-58. Submarine client needs to generate fat jar. Contributed by Zac Zhou. --- hadoop-submarine/hadoop-submarine-all/pom.xml | 183 ++++++++++ .../hadoop-submarine-core/pom.xml | 4 + .../cli/param/runjob/RunJobParameters.java | 5 +- .../runjob/TensorFlowRunJobParameters.java | 10 +- .../common/resource/ResourceUtils.java | 332 ++++++++++++++++++ .../common/resource/UnitsConversionUtil.java | 164 +++++++++ .../common/resource/package-info.java | 19 + .../src/site/markdown/QuickStart.md | 63 +++- .../TestRunJobCliParsingCommonYaml.java | 21 +- .../TestRunJobCliParsingPyTorchYaml.java | 82 +++-- .../TestRunJobCliParsingTensorFlowYaml.java | 85 +++-- ...JobCliParsingTensorFlowYamlStandalone.java | 6 +- .../submarine/common/MockClientContext.java | 23 -- .../runjob-common-yaml/empty-framework.yaml | 6 +- .../runjob-common-yaml/missing-configs.yaml | 6 +- .../runjob-common-yaml/missing-framework.yaml | 6 +- .../some-sections-missing.yaml | 4 +- .../runjob-common-yaml/test-false-values.yaml | 4 +- .../runjob-common-yaml/wrong-indentation.yaml | 6 +- .../wrong-property-name.yaml | 6 +- .../runjob-pytorch-yaml/envs-are-missing.yaml | 2 +- .../invalid-config-ps-section.yaml | 2 +- .../invalid-config-tensorboard-section.yaml | 2 +- .../security-principal-is-missing.yaml | 2 +- .../valid-config-with-overrides.yaml | 2 +- .../runjob-pytorch-yaml/valid-config.yaml | 2 +- .../runjob-pytorch-yaml/valid-gpu-config.yaml | 54 +++ .../envs-are-missing.yaml | 6 +- .../security-principal-is-missing.yaml | 6 +- .../tensorboard-dockerimage-is-missing.yaml | 6 +- .../valid-config-with-overrides.yaml | 6 +- .../runjob-tensorflow-yaml/valid-config.yaml | 6 +- .../valid-gpu-config.yaml | 63 ++++ .../hadoop-submarine-dist/pom.xml | 131 +++++++ .../src/assembly/distribution.xml | 61 ++++ .../hadoop-submarine-tony-runtime/pom.xml | 1 + .../submarine/runtimes/tony/TonyUtils.java | 21 +- .../pom.xml | 4 +- .../yarnservice/AbstractComponent.java | 6 +- .../TestTensorFlowWorkerComponent.java | 6 +- .../utils/TestSubmarineResourceUtils.java | 50 ++- hadoop-submarine/pom.xml | 133 ++++++- 42 files changed, 1424 insertions(+), 183 deletions(-) create mode 100644 hadoop-submarine/hadoop-submarine-all/pom.xml create mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/ResourceUtils.java create mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/UnitsConversionUtil.java create mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/package-info.java create mode 100644 hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-gpu-config.yaml create mode 100644 hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-gpu-config.yaml create mode 100644 hadoop-submarine/hadoop-submarine-dist/pom.xml create mode 100644 hadoop-submarine/hadoop-submarine-dist/src/assembly/distribution.xml diff --git a/hadoop-submarine/hadoop-submarine-all/pom.xml b/hadoop-submarine/hadoop-submarine-all/pom.xml new file mode 100644 index 0000000000..e2d2e172d0 --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-all/pom.xml @@ -0,0 +1,183 @@ + + + + 4.0.0 + + hadoop-submarine + org.apache.hadoop + 0.2.0-SNAPSHOT + + ${project.artifactId} + ${project.version} + Hadoop Submarine All + + + + ${project.parent.parent.basedir} + hadoop-submarine-all + 0.2.0-SNAPSHOT + + + + + + + + org.apache.hadoop + hadoop-hdfs + + + + org.apache.hadoop + hadoop-common + + + + org.apache.hadoop + hadoop-submarine-core + ${project.version} + + + + + + + hadoop-3.2 + + + org.apache.hadoop + hadoop-submarine-yarnservice-runtime + ${project.version} + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + + + + + + + hadoop-3.1 + + true + + + + org.apache.hadoop + hadoop-submarine-yarnservice-runtime + ${project.version} + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + + + + + + hadoop-2.9 + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + + + + + + hadoop-2.7 + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.1 + + + package + + shade + + + + target/${project.artifactId}-${project.version}-${project.activeProfiles[0].id}.jar + + + classworlds:classworlds + junit:junit + jmock:* + *:xml-apis + org.apache.maven:lib:tests + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + org.apache.hadoop.yarn.submarine.client.cli.Cli + + + + + + + + + + + diff --git a/hadoop-submarine/hadoop-submarine-core/pom.xml b/hadoop-submarine/hadoop-submarine-core/pom.xml index 4a387fcdfd..3b267fa436 100644 --- a/hadoop-submarine/hadoop-submarine-core/pom.xml +++ b/hadoop-submarine/hadoop-submarine-core/pom.xml @@ -67,6 +67,10 @@ org.yaml snakeyaml + + org.apache.commons + commons-lang3 + diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/RunJobParameters.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/RunJobParameters.java index 629b1be1fc..037593b2dd 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/RunJobParameters.java +++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/RunJobParameters.java @@ -45,7 +45,7 @@ import org.apache.hadoop.yarn.submarine.common.ClientContext; import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole; import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; import org.yaml.snakeyaml.introspector.Property; import org.yaml.snakeyaml.introspector.PropertyUtils; @@ -271,8 +271,7 @@ private Resource determineWorkerResource(ParametersHolder parametersHolder, throw new ParseException( "--" + CliConstants.WORKER_RES + " is absent."); } - return ResourceUtils.createResourceFromString(workerResourceStr, - clientContext.getOrCreateYarnClient().getResourceTypeInfo()); + return ResourceUtils.createResourceFromString(workerResourceStr); } return null; } diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java index a5f6ad6960..b0deeb70d8 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java +++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java @@ -27,7 +27,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters; import org.apache.hadoop.yarn.submarine.common.ClientContext; import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; import java.io.IOException; import java.util.List; @@ -127,8 +127,7 @@ private Resource determinePSResource(ParametersHolder parametersHolder, if (psResourceStr == null) { throw new ParseException("--" + CliConstants.PS_RES + " is absent."); } - return ResourceUtils.createResourceFromString(psResourceStr, - clientContext.getOrCreateYarnClient().getResourceTypeInfo()); + return ResourceUtils.createResourceFromString(psResourceStr); } return null; } @@ -151,9 +150,8 @@ private RoleParameters getTensorBoardParameters( if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) { tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES; } - Resource tensorboardResource = - ResourceUtils.createResourceFromString(tensorboardResourceStr, - clientContext.getOrCreateYarnClient().getResourceTypeInfo()); + Resource tensorboardResource = ResourceUtils.createResourceFromString( + tensorboardResourceStr); String tensorboardDockerImage = parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE); return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null, diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/ResourceUtils.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/ResourceUtils.java new file mode 100644 index 0000000000..375ae354ad --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/ResourceUtils.java @@ -0,0 +1,332 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. + */ + +package org.apache.hadoop.yarn.submarine.common.resource; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * This class implements some methods with the almost the same logic as + * org.apache.hadoop.yarn.util.resource.ResourceUtils of hadoop 3.3. + * If the hadoop dependencies are upgraded to 3.3, this class can be refactored + * with org.apache.hadoop.yarn.util.resource.ResourceUtils. + */ +public final class ResourceUtils { + + private final static String RES_PATTERN = "^[^=]+=\\d+\\s?\\w*$"; + private final static String SET_RESOURCE_VALUE_METHOD = "setResourceValue"; + private final static String SET_MEMORY_SIZE_METHOD = "setMemorySize"; + private final static String DEPRECATED_SET_MEMORY_SIZE_METHOD = + "setMemory"; + private final static String GET_MEMORY_SIZE_METHOD = "getMemorySize"; + private final static String DEPRECATED_GET_MEMORY_SIZE_METHOD = + "getMemory"; + private final static String GET_RESOURCE_VALUE_METHOD = "getResourceValue"; + private final static String GET_RESOURCE_TYPE_METHOD = + "getResourcesTypeInfo"; + private final static String REINITIALIZE_RESOURCES_METHOD = + "reinitializeResources"; + public static final String MEMORY_URI = "memory-mb"; + public static final String VCORES_URI = "vcores"; + public static final String GPU_URI = "yarn.io/gpu"; + public static final String FPGA_URI = "yarn.io/fpga"; + + private static final Logger LOG = + LoggerFactory.getLogger(ResourceUtils.class); + + private ResourceUtils() {} + + public static Resource createResourceFromString(String resourceStr) { + Map typeToValue = parseResourcesString(resourceStr); + Resource resource = Resource.newInstance(0, 0); + for (Map.Entry entry : typeToValue.entrySet()) { + if(entry.getKey().equals(VCORES_URI)) { + resource.setVirtualCores(entry.getValue().intValue()); + continue; + } else if (entry.getKey().equals(MEMORY_URI)) { + setMemorySize(resource, entry.getValue()); + continue; + } + setResource(resource, entry.getKey(), entry.getValue().intValue()); + } + return resource; + } + + private static Map parseResourcesString(String resourcesStr) { + Map resources = new HashMap<>(); + String[] pairs = resourcesStr.trim().split(","); + for (String resource : pairs) { + resource = resource.trim(); + if (!resource.matches(RES_PATTERN)) { + throw new IllegalArgumentException("\"" + resource + "\" is not a " + + "valid resource type/amount pair. " + + "Please provide key=amount pairs separated by commas."); + } + String[] splits = resource.split("="); + String key = splits[0], value = splits[1]; + String units = getUnits(value); + + String valueWithoutUnit = value.substring(0, + value.length()- units.length()).trim(); + long resourceValue = Long.parseLong(valueWithoutUnit); + + // Convert commandline unit to standard YARN unit. + if (units.equals("M") || units.equals("m")) { + units = "Mi"; + } else if (units.equals("G") || units.equals("g")) { + units = "Gi"; + } else if (!units.isEmpty()){ + throw new IllegalArgumentException("Acceptable units are M/G or empty"); + } + + // special handle memory-mb and memory + if (key.equals(MEMORY_URI)) { + if (!units.isEmpty()) { + resourceValue = UnitsConversionUtil.convert(units, "Mi", + resourceValue); + } + } + + if (key.equals("memory")) { + key = MEMORY_URI; + resourceValue = UnitsConversionUtil.convert(units, "Mi", + resourceValue); + } + + // special handle gpu + if (key.equals("gpu")) { + key = GPU_URI; + } + + // special handle fpga + if (key.equals("fpga")) { + key = FPGA_URI; + } + + resources.put(key, resourceValue); + } + return resources; + } + + /** + * As hadoop 2.9.2 and lower don't support resources except cpu and memory. + * Use reflection to set GPU or other resources for compatibility with + * hadoop 2.9.2 + */ + public static void setResource(Resource resource, String resourceName, + int resourceValue) { + try { + Method method = resource.getClass().getMethod(SET_RESOURCE_VALUE_METHOD, + String.class, long.class); + method.invoke(resource, resourceName, resourceValue); + } catch (NoSuchMethodException e) { + LOG.error("There is no '" + SET_RESOURCE_VALUE_METHOD + "' API in this" + + "version of YARN", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.error("Failed to invoke '" + SET_RESOURCE_VALUE_METHOD + + "' method to set GPU resources", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + return; + } + + public static void setMemorySize(Resource resource, Long memorySize) { + boolean useWithIntParameter = false; + // For hadoop 2.9.2 and above + try { + Method method = resource.getClass().getMethod(SET_MEMORY_SIZE_METHOD, + long.class); + method.setAccessible(true); + method.invoke(resource, memorySize); + } catch (NoSuchMethodException nsme) { + LOG.info("There is no '" + SET_MEMORY_SIZE_METHOD + "(long)' API in" + + " this version of YARN"); + useWithIntParameter = true; + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.error("Failed to invoke '" + SET_MEMORY_SIZE_METHOD + + "' method", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + // For hadoop 2.7.3 + if (useWithIntParameter) { + try { + LOG.info("Trying to use '" + DEPRECATED_SET_MEMORY_SIZE_METHOD + + "(int)' API for this version of YARN"); + Method method = resource.getClass().getMethod( + DEPRECATED_SET_MEMORY_SIZE_METHOD, int.class); + method.invoke(resource, memorySize.intValue()); + } catch (NoSuchMethodException e) { + LOG.error("There is no '" + DEPRECATED_SET_MEMORY_SIZE_METHOD + + "(int)' API in this version of YARN", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.error("Failed to invoke '" + DEPRECATED_SET_MEMORY_SIZE_METHOD + + "' method", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + } + } + + public static long getMemorySize(Resource resource) { + boolean useWithIntParameter = false; + long memory = 0; + // For hadoop 2.9.2 and above + try { + Method method = resource.getClass().getMethod(GET_MEMORY_SIZE_METHOD); + method.setAccessible(true); + memory = (long) method.invoke(resource); + } catch (NoSuchMethodException e) { + LOG.info("There is no '" + GET_MEMORY_SIZE_METHOD + "' API in" + + " this version of YARN"); + useWithIntParameter = true; + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.error("Failed to invoke '" + GET_MEMORY_SIZE_METHOD + + "' method", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + // For hadoop 2.7.3 + if (useWithIntParameter) { + try { + LOG.info("Trying to use '" + DEPRECATED_GET_MEMORY_SIZE_METHOD + + "' API for this version of YARN"); + Method method = resource.getClass().getMethod( + DEPRECATED_GET_MEMORY_SIZE_METHOD); + method.setAccessible(true); + memory = ((Integer) method.invoke(resource)).longValue(); + } catch (NoSuchMethodException e) { + LOG.error("There is no '" + DEPRECATED_GET_MEMORY_SIZE_METHOD + + "' API in this version of YARN", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.error("Failed to invoke '" + DEPRECATED_GET_MEMORY_SIZE_METHOD + + "' method", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + } + return memory; + } + + /** + * As hadoop 2.9.2 and lower don't support resources except cpu and memory. + * Use reflection to set GPU or other resources for compatibility with + * hadoop 2.9.2 + */ + public static long getResourceValue(Resource resource, String resourceName) { + long resourceValue = 0; + try { + Method method = resource.getClass().getMethod(GET_RESOURCE_VALUE_METHOD, + String.class); + Object value = method.invoke(resource, resourceName); + resourceValue = (long) value; + } catch (NoSuchMethodException e) { + LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" + + " version of YARN"); + } catch (InvocationTargetException e) { + if (e.getTargetException().getClass().getName().equals( + "org.apache.hadoop.yarn.exceptions.ResourceNotFoundException")) { + LOG.info("Not found resource " + resourceName); + } else { + LOG.info("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD + "'" + + " method to get resource " + resourceName); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + } catch (IllegalAccessException | ClassCastException e) { + LOG.error("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD + + "' method to get resource " + resourceName, e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + return resourceValue; + } + + /** + * As hadoop 2.9.2 and lower don't support resources except cpu and memory. + * Use reflection to add GPU or other resources for compatibility with + * hadoop 2.9.2 + */ + public static void configureResourceType(String resrouceName) { + Class resourceTypeInfo; + try{ + resourceTypeInfo = Class.forName( + "org.apache.hadoop.yarn.api.records.ResourceTypeInfo"); + Class resourceUtils = Class.forName( + "org.apache.hadoop.yarn.util.resource.ResourceUtils"); + Method method = resourceUtils.getMethod(GET_RESOURCE_TYPE_METHOD); + Object resTypes = method.invoke(null); + + Method resourceTypeInstance = resourceTypeInfo.getMethod("newInstance", + String.class, String.class); + Object resourceType = resourceTypeInstance.invoke(null, resrouceName, ""); + ((ArrayList)resTypes).add(resourceType); + + Method reInitialMethod = resourceUtils.getMethod( + REINITIALIZE_RESOURCES_METHOD, List.class); + reInitialMethod.invoke(null, resTypes); + + } catch (ClassNotFoundException e) { + LOG.info("There is no specified class API in this" + + " version of YARN"); + LOG.info(e.getMessage()); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } catch (NoSuchMethodException nsme) { + LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" + + " version of YARN"); + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.info("Failed to invoke 'configureResourceType' method ", e); + throw new SubmarineRuntimeException(e.getMessage(), e.getCause()); + } + } + + private static String getUnits(String resourceValue) { + return parseResourceValue(resourceValue)[0]; + } + + /** + * Extract unit and actual value from resource value. + * @param resourceValue Value of the resource + * @return Array containing unit and value. [0]=unit, [1]=value + * @throws IllegalArgumentException if units contain non alpha characters + */ + private static String[] parseResourceValue(String resourceValue) { + String[] resource = new String[2]; + int i = 0; + for (; i < resourceValue.length(); i++) { + if (Character.isAlphabetic(resourceValue.charAt(i))) { + break; + } + } + String units = resourceValue.substring(i); + + if (StringUtils.isAlpha(units) || units.equals("")) { + resource[0] = units; + resource[1] = resourceValue.substring(0, i); + return resource; + } else { + throw new IllegalArgumentException("Units '" + units + "'" + + " contains non alphabet characters, which is not allowed."); + } + } + +} diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/UnitsConversionUtil.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/UnitsConversionUtil.java new file mode 100644 index 0000000000..8a8635a180 --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/UnitsConversionUtil.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.submarine.common.resource; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Almost the same logic as UnitsConversionUtil[YARN-4081]. If the dependencies + * are upgraded to hadoop 3.*, this class can be replaced. + */ +public final class UnitsConversionUtil { + + private UnitsConversionUtil() {} + + /** + * Helper class for encapsulating conversion values. + */ + public static class Converter { + private long numerator; + private long denominator; + + Converter(long n, long d) { + this.numerator = n; + this.denominator = d; + } + } + + private static final String[] UNITS = {"p", "n", "u", "m", "", "k", "M", "G", + "T", "P", "Ki", "Mi", "Gi", "Ti", "Pi"}; + private static final List SORTED_UNITS = Arrays.asList(UNITS); + public static final Set KNOWN_UNITS = createKnownUnitsSet(); + private static final Converter PICO = + new Converter(1L, 1000L * 1000L * 1000L * 1000L); + private static final Converter NANO = + new Converter(1L, 1000L * 1000L * 1000L); + private static final Converter MICRO = new Converter(1L, 1000L * 1000L); + private static final Converter MILLI = new Converter(1L, 1000L); + private static final Converter BASE = new Converter(1L, 1L); + private static final Converter KILO = new Converter(1000L, 1L); + private static final Converter MEGA = new Converter(1000L * 1000L, 1L); + private static final Converter GIGA = + new Converter(1000L * 1000L * 1000L, 1L); + private static final Converter TERA = + new Converter(1000L * 1000L * 1000L * 1000L, 1L); + private static final Converter PETA = + new Converter(1000L * 1000L * 1000L * 1000L * 1000L, 1L); + + private static final Converter KILO_BINARY = new Converter(1024L, 1L); + private static final Converter MEGA_BINARY = new Converter(1024L * 1024L, 1L); + private static final Converter GIGA_BINARY = + new Converter(1024L * 1024L * 1024L, 1L); + private static final Converter TERA_BINARY = + new Converter(1024L * 1024L * 1024L * 1024L, 1L); + private static final Converter PETA_BINARY = + new Converter(1024L * 1024L * 1024L * 1024L * 1024L, 1L); + + private static Set createKnownUnitsSet() { + Set ret = new HashSet<>(); + ret.addAll(Arrays.asList(UNITS)); + return ret; + } + + private static Converter getConverter(String unit) { + switch (unit) { + case "p": + return PICO; + case "n": + return NANO; + case "u": + return MICRO; + case "m": + return MILLI; + case "": + return BASE; + case "k": + return KILO; + case "M": + return MEGA; + case "G": + return GIGA; + case "T": + return TERA; + case "P": + return PETA; + case "Ki": + return KILO_BINARY; + case "Mi": + return MEGA_BINARY; + case "Gi": + return GIGA_BINARY; + case "Ti": + return TERA_BINARY; + case "Pi": + return PETA_BINARY; + default: + throw new IllegalArgumentException( + "Unknown unit '" + unit + "'. Known units are " + KNOWN_UNITS); + } + } + + /** + * Converts a value from one unit to another. Supported units can be obtained + * by inspecting the KNOWN_UNITS set. + * + * @param fromUnit the unit of the from value + * @param toUnit the target unit + * @param fromValue the value you wish to convert + * @return the value in toUnit + */ + public static long convert(String fromUnit, String toUnit, long fromValue) { + if (toUnit == null || fromUnit == null) { + throw new IllegalArgumentException("One or more arguments are null"); + } + + if (fromUnit.equals(toUnit)) { + return fromValue; + } + Converter fc = getConverter(fromUnit); + Converter tc = getConverter(toUnit); + long numerator = fc.numerator * tc.denominator; + long denominator = fc.denominator * tc.numerator; + long numeratorMultiplierLimit = Long.MAX_VALUE / numerator; + if (numerator < denominator) { + if (numeratorMultiplierLimit < fromValue) { + String overflowMsg = + "Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit + + "' will result in an overflow of Long"; + throw new IllegalArgumentException(overflowMsg); + } + return (fromValue * numerator) / denominator; + } + if (numeratorMultiplierLimit > fromValue) { + return (numerator * fromValue) / denominator; + } + long tmp = numerator / denominator; + if ((Long.MAX_VALUE / tmp) < fromValue) { + String overflowMsg = + "Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit + + "' will result in an overflow of Long"; + throw new IllegalArgumentException(overflowMsg); + } + return fromValue * tmp; + } + +} diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/package-info.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/package-info.java new file mode 100644 index 0000000000..144316b12f --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/common/resource/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package contains resource utility classes. + */ +package org.apache.hadoop.yarn.submarine.common.resource; \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md index 5648c11e64..f693917d90 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md @@ -164,6 +164,22 @@ See below screenshot: ![alt text](./images/tensorboard-service.png "Tensorboard service") +If there is no hadoop client, we can also use the java command and the uber jar, hadoop-submarine-all-*.jar, to submit the job. + +``` +java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \ + org.apache.hadoop.yarn.submarine.client.cli.Cli job run \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \ + --docker_image \ + --input_path hdfs://default/dataset/cifar-10-data \ + --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ + --worker_resources memory=4G,vcores=2,gpu=2 \ + --worker_launch_cmd "python ... (Your training application cmd)" \ + --tensorboard # this will launch a companion tensorboard container for monitoring +``` + + ### Launch Distributed Tensorflow Application: #### Commandline @@ -181,6 +197,20 @@ yarn jar hadoop-yarn-applications-submarine-.jar job run \ --num_ps 2 \ --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \ ``` +Or +``` +java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \ + org.apache.hadoop.yarn.submarine.client.cli.Cli job run \ + --name tf-job-001 --docker_image \ + --input_path hdfs://default/dataset/cifar-10-data \ + --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --num_workers 2 \ + --worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \ + --num_ps 2 \ + --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \ +``` #### Notes: @@ -197,7 +227,11 @@ yarn jar hadoop-yarn-applications-submarine-.jar job run \ ``` yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001 ``` - +Or +``` +java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \ + org.apache.hadoop.yarn.submarine.client.cli.Cli job show --name tf-job-001 +``` Output looks like: ``` Job Meta Info: @@ -222,6 +256,17 @@ yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \ --num_workers 0 --tensorboard ``` +Or +``` +# Cleanup previous service if needed +yarn app -destroy tensorboard-service; \ +java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \ + org.apache.hadoop.yarn.submarine.client.cli.Cli job run \ + --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --num_workers 0 --tensorboard +``` You can view multiple job training history like from the `Tensorboard` link: @@ -242,4 +287,18 @@ If you want to build the Submarine project by yourself, you can follow the steps - Run 'mvn install -DskipTests' from Hadoop source top level once. -- Navigate to hadoop-submarine folder and run 'mvn clean package'. \ No newline at end of file +- Navigate to hadoop-submarine folder and run 'mvn clean package'. + + - By Default, hadoop-submarine is built based on hadoop 3.1.2 dependencies. + Both yarn service runtime and tony runtime are built. + You can also add a parameter of "-Phadoop-3.2" to specify the dependencies + to hadoop 3.2.0. + + - Hadoop-submarine can support hadoop 2.9.2 and hadoop 2.7.4 as well. + You can add "-Phadoop-2.9" to build submarine based on hadoop 2.9.2. + For example: + ``` + mvn clean package -Phadoop-2.9 + ``` + As yarn service is based on hadoop 3.*, so only tony runtime is built + in this case. diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/TestRunJobCliParsingCommonYaml.java b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/TestRunJobCliParsingCommonYaml.java index 5e2da3af7b..f174f04cf7 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/TestRunJobCliParsingCommonYaml.java +++ b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/TestRunJobCliParsingCommonYaml.java @@ -16,15 +16,14 @@ package org.apache.hadoop.yarn.submarine.client.cli.runjob; -import org.apache.hadoop.yarn.api.records.ResourceInformation; -import org.apache.hadoop.yarn.api.records.ResourceTypeInfo; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -32,10 +31,10 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; -import java.util.ArrayList; -import java.util.List; import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext; import static org.junit.Assert.assertFalse; @@ -49,6 +48,8 @@ public class TestRunJobCliParsingCommonYaml { private static final String DIR_NAME = "runjob-common-yaml"; private static final String TF_DIR = "runjob-pytorch-yaml"; private File yamlConfig; + private static Logger LOG = LoggerFactory.getLogger( + TestRunJobCliParsingCommonYaml.class); @Before public void before() { @@ -62,10 +63,12 @@ public void after() { @BeforeClass public static void configureResourceTypes() { - List resTypes = new ArrayList<>( - ResourceUtils.getResourcesTypeInfo()); - resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, "")); - ResourceUtils.reinitializeResources(resTypes); + try { + ResourceUtils.configureResourceType(ResourceUtils.GPU_URI); + } catch (SubmarineRuntimeException e) { + LOG.info("The hadoop dependency doesn't support gpu resource, " + + "so just skip this test case."); + } } @Rule diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/pytorch/TestRunJobCliParsingPyTorchYaml.java b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/pytorch/TestRunJobCliParsingPyTorchYaml.java index 858c2de442..bf94edcf09 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/pytorch/TestRunJobCliParsingPyTorchYaml.java +++ b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/pytorch/TestRunJobCliParsingPyTorchYaml.java @@ -24,29 +24,28 @@ import static org.junit.Assert.assertTrue; import java.io.File; -import java.util.ArrayList; import java.util.List; -import org.apache.hadoop.yarn.api.records.ResourceInformation; -import org.apache.hadoop.yarn.api.records.ResourceTypeInfo; -import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper; +import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException; import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; +import org.apache.hadoop.yarn.util.resource.Resources; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Test class that verifies the correctness of PyTorch @@ -56,6 +55,8 @@ public class TestRunJobCliParsingPyTorchYaml { private static final String OVERRIDDEN_PREFIX = "overridden_"; private static final String DIR_NAME = "runjob-pytorch-yaml"; private File yamlConfig; + private static Logger LOG = LoggerFactory.getLogger( + TestRunJobCliParsingPyTorchYaml.class); @Before public void before() { @@ -67,14 +68,6 @@ public void after() { YamlConfigTestUtils.deleteFile(yamlConfig); } - @BeforeClass - public static void configureResourceTypes() { - List resTypes = new ArrayList<>( - ResourceUtils.getResourcesTypeInfo()); - resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, "")); - ResourceUtils.reinitializeResources(resTypes); - } - @Rule public ExpectedException exception = ExpectedException.none(); @@ -105,23 +98,38 @@ private void verifyBasicConfigValues(RunJobParameters jobRunParameters, } } - private void verifyWorkerValues(RunJobParameters jobRunParameters, - String prefix) { + private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters + jobRunParameters, String prefix) { assertTrue(RunJobParameters.class + " must be an instance of " + PyTorchRunJobParameters.class, jobRunParameters instanceof PyTorchRunJobParameters); - PyTorchRunJobParameters tensorFlowParams = + PyTorchRunJobParameters pyTorchParams = (PyTorchRunJobParameters) jobRunParameters; - assertEquals(3, tensorFlowParams.getNumWorkers()); + assertEquals(3, pyTorchParams.getNumWorkers()); assertEquals(prefix + "testLaunchCmdWorker", - tensorFlowParams.getWorkerLaunchCmd()); + pyTorchParams.getWorkerLaunchCmd()); assertEquals(prefix + "testDockerImageWorker", - tensorFlowParams.getWorkerDockerImage()); - assertEquals(ResourceTypesTestHelper.newResource(20480L, 32, - ImmutableMap. builder() - .put(ResourceInformation.GPU_URI, "2").build()), - tensorFlowParams.getWorkerResource()); + pyTorchParams.getWorkerDockerImage()); + return pyTorchParams; + } + + private void verifyWorkerValues(RunJobParameters jobRunParameters, + String prefix) { + PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues + (jobRunParameters, prefix); + assertEquals(Resources.createResource(20480, 32), + pyTorchParams.getWorkerResource()); + } + + private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters, + String prefix) { + + PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues + (jobRunParameters, prefix); + Resource workResource = Resources.createResource(20480, 32); + ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2); + assertEquals(workResource, pyTorchParams.getWorkerResource()); } private void verifySecurityValues(RunJobParameters jobRunParameters) { @@ -146,6 +154,30 @@ public void testValidYamlParsing() throws Exception { verifySecurityValues(jobRunParameters); } + @Test + public void testValidGpuYamlParsing() throws Exception { + try { + ResourceUtils.configureResourceType(ResourceUtils.GPU_URI); + } catch (SubmarineRuntimeException e) { + LOG.info("The hadoop dependency doesn't support gpu resource, " + + "so just skip this test case."); + return; + } + + RunJobCli runJobCli = new RunJobCli(getMockClientContext()); + Assert.assertFalse(SubmarineLogs.isVerbose()); + + yamlConfig = YamlConfigTestUtils.createTempFileWithContents( + DIR_NAME + "/valid-gpu-config.yaml"); + runJobCli.run( + new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"}); + + RunJobParameters jobRunParameters = runJobCli.getRunJobParameters(); + verifyBasicConfigValues(jobRunParameters); + verifyWorkerValuesWithGpu(jobRunParameters, ""); + verifySecurityValues(jobRunParameters); + } + @Test public void testRoleOverrides() throws Exception { RunJobCli runJobCli = new RunJobCli(getMockClientContext()); diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYaml.java b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYaml.java index c3b2accc82..9c69720e5b 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYaml.java +++ b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYaml.java @@ -18,26 +18,25 @@ package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import org.apache.hadoop.yarn.api.records.ResourceInformation; -import org.apache.hadoop.yarn.api.records.ResourceTypeInfo; -import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper; +import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; +import org.apache.hadoop.yarn.util.resource.Resources; import org.junit.After; import org.junit.Assert; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; -import java.util.ArrayList; import java.util.List; import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext; @@ -53,6 +52,8 @@ public class TestRunJobCliParsingTensorFlowYaml { private static final String OVERRIDDEN_PREFIX = "overridden_"; private static final String DIR_NAME = "runjob-tensorflow-yaml"; private File yamlConfig; + private static Logger LOG = LoggerFactory.getLogger( + TestRunJobCliParsingTensorFlowYaml.class); @Before public void before() { @@ -64,14 +65,6 @@ public void after() { YamlConfigTestUtils.deleteFile(yamlConfig); } - @BeforeClass - public static void configureResourceTypes() { - List resTypes = new ArrayList<>( - ResourceUtils.getResourcesTypeInfo()); - resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, "")); - ResourceUtils.reinitializeResources(resTypes); - } - @Rule public ExpectedException exception = ExpectedException.none(); @@ -114,14 +107,12 @@ private void verifyPsValues(RunJobParameters jobRunParameters, assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd()); assertEquals(prefix + "testDockerImagePs", tensorFlowParams.getPsDockerImage()); - assertEquals(ResourceTypesTestHelper.newResource(20500L, 34, - ImmutableMap. builder() - .put(ResourceInformation.GPU_URI, "4").build()), + assertEquals(Resources.createResource(20500, 34), tensorFlowParams.getPsResource()); } - private void verifyWorkerValues(RunJobParameters jobRunParameters, - String prefix) { + private TensorFlowRunJobParameters verifyWorkerCommonValues( + RunJobParameters jobRunParameters, String prefix) { assertTrue(RunJobParameters.class + " must be an instance of " + TensorFlowRunJobParameters.class, jobRunParameters instanceof TensorFlowRunJobParameters); @@ -133,12 +124,26 @@ private void verifyWorkerValues(RunJobParameters jobRunParameters, tensorFlowParams.getWorkerLaunchCmd()); assertEquals(prefix + "testDockerImageWorker", tensorFlowParams.getWorkerDockerImage()); - assertEquals(ResourceTypesTestHelper.newResource(20480L, 32, - ImmutableMap. builder() - .put(ResourceInformation.GPU_URI, "2").build()), + return tensorFlowParams; + } + + private void verifyWorkerValues(RunJobParameters jobRunParameters, + String prefix) { + TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues + (jobRunParameters, prefix); + assertEquals(Resources.createResource(20480, 32), tensorFlowParams.getWorkerResource()); } + private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters, + String prefix) { + TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues + (jobRunParameters, prefix); + Resource workResource = Resources.createResource(20480, 32); + ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2); + assertEquals(workResource, tensorFlowParams.getWorkerResource()); + } + private void verifySecurityValues(RunJobParameters jobRunParameters) { assertEquals("keytabPath", jobRunParameters.getKeytab()); assertEquals("testPrincipal", jobRunParameters.getPrincipal()); @@ -155,9 +160,7 @@ private void verifyTensorboardValues(RunJobParameters jobRunParameters) { assertTrue(tensorFlowParams.isTensorboardEnabled()); assertEquals("tensorboardDockerImage", tensorFlowParams.getTensorboardDockerImage()); - assertEquals(ResourceTypesTestHelper.newResource(21000L, 37, - ImmutableMap. builder() - .put(ResourceInformation.GPU_URI, "3").build()), + assertEquals(Resources.createResource(21000, 37), tensorFlowParams.getTensorboardResource()); } @@ -179,6 +182,32 @@ public void testValidYamlParsing() throws Exception { verifyTensorboardValues(jobRunParameters); } + @Test + public void testValidGpuYamlParsing() throws Exception { + try { + ResourceUtils.configureResourceType(ResourceUtils.GPU_URI); + } catch (SubmarineRuntimeException e) { + LOG.info("The hadoop dependency doesn't support gpu resource, " + + "so just skip this test case."); + return; + } + + RunJobCli runJobCli = new RunJobCli(getMockClientContext()); + Assert.assertFalse(SubmarineLogs.isVerbose()); + + yamlConfig = YamlConfigTestUtils.createTempFileWithContents( + DIR_NAME + "/valid-gpu-config.yaml"); + runJobCli.run( + new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"}); + + RunJobParameters jobRunParameters = runJobCli.getRunJobParameters(); + verifyBasicConfigValues(jobRunParameters); + verifyPsValues(jobRunParameters, ""); + verifyWorkerValuesWithGpu(jobRunParameters, ""); + verifySecurityValues(jobRunParameters); + verifyTensorboardValues(jobRunParameters); + } + @Test public void testRoleOverrides() throws Exception { RunJobCli runJobCli = new RunJobCli(getMockClientContext()); @@ -240,9 +269,7 @@ public void testMissingTensorBoardDockerImage() throws Exception { assertTrue(tensorFlowParams.isTensorboardEnabled()); assertNull("tensorboardDockerImage should be null!", tensorFlowParams.getTensorboardDockerImage()); - assertEquals(ResourceTypesTestHelper.newResource(21000L, 37, - ImmutableMap. builder() - .put(ResourceInformation.GPU_URI, "3").build()), + assertEquals(Resources.createResource(21000, 37), tensorFlowParams.getTensorboardResource()); } diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYamlStandalone.java b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYamlStandalone.java index 59b0b59f61..2088d1df83 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYamlStandalone.java +++ b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/tensorflow/TestRunJobCliParsingTensorFlowYamlStandalone.java @@ -132,14 +132,14 @@ private void assertRoleConfigOverrides(Role role, String prefix, private void assertWorkerValues(Role worker) { assertEquals("testLaunchCmdWorker", worker.getLaunchCmd()); assertEquals("testDockerImageWorker", worker.getDockerImage()); - assertEquals("memory=20480M,vcores=32,gpu=2", worker.getResources()); + assertEquals("memory=20480M,vcores=32", worker.getResources()); assertEquals(3, worker.getReplicas()); } private void assertPsValues(Role ps) { assertEquals("testLaunchCmdPs", ps.getLaunchCmd()); assertEquals("testDockerImagePs", ps.getDockerImage()); - assertEquals("memory=20500M,vcores=34,gpu=4", ps.getResources()); + assertEquals("memory=20500M,vcores=34", ps.getResources()); assertEquals(4, ps.getReplicas()); } @@ -161,7 +161,7 @@ private void verifyTensorboardValues(YamlConfigFile yamlConfigFile) { TensorBoard tensorBoard = yamlConfigFile.getTensorBoard(); assertNotNull("Tensorboard should not be null!", tensorBoard); assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage()); - assertEquals("memory=21000M,vcores=37,gpu=3", tensorBoard.getResources()); + assertEquals("memory=21000M,vcores=37", tensorBoard.getResources()); } @Before diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/common/MockClientContext.java b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/common/MockClientContext.java index 23c45d20e3..3740f8677d 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/common/MockClientContext.java +++ b/hadoop-submarine/hadoop-submarine-core/src/test/java/org/apache/hadoop/yarn/submarine/common/MockClientContext.java @@ -20,15 +20,6 @@ import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager; import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager; -import org.apache.hadoop.yarn.client.api.YarnClient; -import org.apache.hadoop.yarn.exceptions.YarnException; -import org.apache.hadoop.yarn.util.resource.ResourceUtils; - -import java.io.IOException; - -import static org.junit.Assert.fail; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; public class MockClientContext extends ClientContext { @@ -44,18 +35,4 @@ public void setRemoteDirectoryMgr( RemoteDirectoryManager remoteDirectoryMgr) { this.remoteDirectoryMgr = remoteDirectoryMgr; } - - @Override - public synchronized YarnClient getOrCreateYarnClient() { - YarnClient client = mock(YarnClient.class); - try { - when(client.getResourceTypeInfo()).thenReturn( - ResourceUtils.getResourcesTypeInfo()); - } catch (YarnException e) { - fail(e.getMessage()); - } catch (IOException e) { - fail(e.getMessage()); - } - return client; - } } diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/empty-framework.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/empty-framework.yaml index 0d9577bac9..e3c15a4f77 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/empty-framework.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/empty-framework.yaml @@ -43,12 +43,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -59,5 +59,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-configs.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-configs.yaml index db19dc2bed..6fa8ed45d4 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-configs.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-configs.yaml @@ -23,11 +23,11 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs @@ -37,5 +37,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-framework.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-framework.yaml index be67fbb0bc..6523a38538 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-framework.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/missing-framework.yaml @@ -42,12 +42,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -58,5 +58,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/some-sections-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/some-sections-missing.yaml index 9c7a56fb16..38505376d7 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/some-sections-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/some-sections-missing.yaml @@ -40,10 +40,10 @@ configs: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/test-false-values.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/test-false-values.yaml index 8a97c49f47..0e300a0fe9 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/test-false-values.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/test-false-values.yaml @@ -43,11 +43,11 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-indentation.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-indentation.yaml index 66f0e16a9f..375c680330 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-indentation.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-indentation.yaml @@ -42,11 +42,11 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs @@ -56,5 +56,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-property-name.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-property-name.yaml index 41b5f2b08c..6ea9c37c2a 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-property-name.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-common-yaml/wrong-property-name.yaml @@ -42,11 +42,11 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs @@ -56,5 +56,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/envs-are-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/envs-are-missing.yaml index d5b82feffc..758be1e154 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/envs-are-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/envs-are-missing.yaml @@ -40,7 +40,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-ps-section.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-ps-section.yaml index a48160efd6..a161e61a8e 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-ps-section.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-ps-section.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-tensorboard-section.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-tensorboard-section.yaml index 5acc2f2001..675fba5bbb 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-tensorboard-section.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/invalid-config-tensorboard-section.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/security-principal-is-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/security-principal-is-missing.yaml index 31868b66eb..5f6c25661b 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/security-principal-is-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/security-principal-is-missing.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config-with-overrides.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config-with-overrides.yaml index 4a1e2c8312..71ac275e31 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config-with-overrides.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config-with-overrides.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: overridden_testLaunchCmdWorker docker_image: overridden_testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config.yaml index a494e9c7e7..2b9dd50ba3 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-config.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-gpu-config.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-gpu-config.yaml new file mode 100644 index 0000000000..a494e9c7e7 --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-pytorch-yaml/valid-gpu-config.yaml @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +spec: + name: testJobName + job_type: testJobType + framework: pytorch + +configs: + input_path: testInputPath + checkpoint_path: testCheckpointPath + saved_model_path: testSavedModelPath + docker_image: testDockerImage + wait_job_finish: true + envs: + env1: 'env1Value' + env2: 'env2Value' + localizations: + - hdfs://remote-file1:/local-filename1:rw + - nfs://remote-file2:/local-filename2:rw + mounts: + - /etc/passwd:/etc/passwd:rw + - /etc/hosts:/etc/hosts:rw + quicklinks: + - Notebook_UI=https://master-0:7070 + - Notebook_UI2=https://master-0:7071 + +scheduling: + queue: queue1 + +roles: + worker: + resources: memory=20480M,vcores=32,gpu=2 + replicas: 3 + launch_cmd: testLaunchCmdWorker + docker_image: testDockerImageWorker + +security: + keytab: keytabPath + principal: testPrincipal + distribute_keytab: true \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/envs-are-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/envs-are-missing.yaml index 4625dad0fc..4deb962ecb 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/envs-are-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/envs-are-missing.yaml @@ -40,12 +40,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -56,5 +56,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/security-principal-is-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/security-principal-is-missing.yaml index d67ef3b369..4341923206 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/security-principal-is-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/security-principal-is-missing.yaml @@ -43,12 +43,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -58,5 +58,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/tensorboard-dockerimage-is-missing.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/tensorboard-dockerimage-is-missing.yaml index c6b2d708c8..cdefb9fa5e 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/tensorboard-dockerimage-is-missing.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/tensorboard-dockerimage-is-missing.yaml @@ -43,12 +43,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -59,4 +59,4 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 \ No newline at end of file + resources: memory=21000M,vcores=37 diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config-with-overrides.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config-with-overrides.yaml index 7b84055637..042bf357b9 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config-with-overrides.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config-with-overrides.yaml @@ -43,7 +43,7 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: overridden_testLaunchCmdWorker docker_image: overridden_testDockerImageWorker @@ -58,7 +58,7 @@ roles: - /etc/hosts:/overridden_Worker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: overridden_testLaunchCmdPs docker_image: overridden_testDockerImagePs @@ -78,5 +78,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config.yaml index 1bbaf58492..7e312004ad 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config.yaml +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-config.yaml @@ -43,12 +43,12 @@ scheduling: roles: worker: - resources: memory=20480M,vcores=32,gpu=2 + resources: memory=20480M,vcores=32 replicas: 3 launch_cmd: testLaunchCmdWorker docker_image: testDockerImageWorker ps: - resources: memory=20500M,vcores=34,gpu=4 + resources: memory=20500M,vcores=34 replicas: 4 launch_cmd: testLaunchCmdPs docker_image: testDockerImagePs @@ -59,5 +59,5 @@ security: distribute_keytab: true tensorBoard: - resources: memory=21000M,vcores=37,gpu=3 + resources: memory=21000M,vcores=37 docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-gpu-config.yaml b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-gpu-config.yaml new file mode 100644 index 0000000000..1ef1df091a --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/test/resources/runjob-tensorflow-yaml/valid-gpu-config.yaml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +spec: + name: testJobName + job_type: testJobType + framework: tensorflow + +configs: + input_path: testInputPath + checkpoint_path: testCheckpointPath + saved_model_path: testSavedModelPath + docker_image: testDockerImage + wait_job_finish: true + envs: + env1: 'env1Value' + env2: 'env2Value' + localizations: + - hdfs://remote-file1:/local-filename1:rw + - nfs://remote-file2:/local-filename2:rw + mounts: + - /etc/passwd:/etc/passwd:rw + - /etc/hosts:/etc/hosts:rw + quicklinks: + - Notebook_UI=https://master-0:7070 + - Notebook_UI2=https://master-0:7071 + +scheduling: + queue: queue1 + +roles: + worker: + resources: memory=20480M,vcores=32,gpu=2 + replicas: 3 + launch_cmd: testLaunchCmdWorker + docker_image: testDockerImageWorker + ps: + resources: memory=20500M,vcores=34 + replicas: 4 + launch_cmd: testLaunchCmdPs + docker_image: testDockerImagePs + +security: + keytab: keytabPath + principal: testPrincipal + distribute_keytab: true + +tensorBoard: + resources: memory=21000M,vcores=37 + docker_image: tensorboardDockerImage \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-dist/pom.xml b/hadoop-submarine/hadoop-submarine-dist/pom.xml new file mode 100644 index 0000000000..423fd0b6bc --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-dist/pom.xml @@ -0,0 +1,131 @@ + + + + 4.0.0 + + hadoop-submarine + org.apache.hadoop + 0.2.0-SNAPSHOT + + ${project.artifactId} + ${project.version} + Hadoop Submarine Dist + pom + + + + ${project.parent.parent.basedir} + hadoop-submarine-dist + 0.2.0-SNAPSHOT + + + + + org.apache.hadoop + hadoop-submarine-core + ${project.version} + + + + + + hadoop-3.2 + + + org.apache.hadoop + hadoop-submarine-yarnservice-runtime + ${project.version} + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + + + + + hadoop-3.1 + + true + + + + org.apache.hadoop + hadoop-submarine-yarnservice-runtime + ${project.version} + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + + + + hadoop-2.9 + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + + + + hadoop-2.7 + + + org.apache.hadoop + hadoop-submarine-tony-runtime + ${project.version} + + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + dist + package + + single + + + ${project.artifactId}-${project.version}-${project.activeProfiles[0].id} + false + false + + src/assembly/distribution.xml + + + + + + + + + diff --git a/hadoop-submarine/hadoop-submarine-dist/src/assembly/distribution.xml b/hadoop-submarine/hadoop-submarine-dist/src/assembly/distribution.xml new file mode 100644 index 0000000000..c6e1c255a8 --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-dist/src/assembly/distribution.xml @@ -0,0 +1,61 @@ + + + + distribution + + dir + tar.gz + + + + + / + + org.apache.hadoop:hadoop-submarine-core + + + + / + + org.apache.hadoop:hadoop-submarine-tony-runtime + + + + / + + org.apache.hadoop:hadoop-submarine-yarnservice-runtime + + + + + + + ../../ + + LICENSE* + NOTICE* + + + + ../hadoop-submarine-all/target/ + / + + hadoop-submarine-all-${project.version}-*.jar + + + + diff --git a/hadoop-submarine/hadoop-submarine-tony-runtime/pom.xml b/hadoop-submarine/hadoop-submarine-tony-runtime/pom.xml index 8dbda98183..a0042018aa 100644 --- a/hadoop-submarine/hadoop-submarine-tony-runtime/pom.xml +++ b/hadoop-submarine/hadoop-submarine-tony-runtime/pom.xml @@ -23,6 +23,7 @@ 4.0.0 hadoop-submarine-tony-runtime + Hadoop Submarine Tony Runtime org.apache.hadoop diff --git a/hadoop-submarine/hadoop-submarine-tony-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/tony/TonyUtils.java b/hadoop-submarine/hadoop-submarine-tony-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/tony/TonyUtils.java index 47ad1a4c93..d818fe0d1f 100644 --- a/hadoop-submarine/hadoop-submarine-tony-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/tony/TonyUtils.java +++ b/hadoop-submarine/hadoop-submarine-tony-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/tony/TonyUtils.java @@ -19,9 +19,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.api.records.ResourceInformation; -import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; +import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils; import java.util.ArrayList; import java.util.Arrays; @@ -52,7 +51,7 @@ public static Configuration tonyConfFromClientContext( tonyConf.setLong( TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME, Constants.MEMORY), - parameters.getPsResource().getMemorySize()); + ResourceUtils.getMemorySize(parameters.getPsResource())); } if (parameters.getWorkerResource() != null) { tonyConf.setInt( @@ -62,16 +61,12 @@ public static Configuration tonyConfFromClientContext( tonyConf.setLong( TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, Constants.MEMORY), - parameters.getWorkerResource().getMemorySize()); - try { - tonyConf.setLong( - TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, - Constants.GPUS), - parameters.getWorkerResource() - .getResourceValue(ResourceInformation.GPU_URI)); - } catch (ResourceNotFoundException rnfe) { - LOG.error("GPU resources not enabled."); - } + ResourceUtils.getMemorySize(parameters.getWorkerResource())); + tonyConf.setLong( + TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, + Constants.GPUS), + ResourceUtils.getResourceValue(parameters.getWorkerResource(), + ResourceUtils.GPU_URI)); } if (parameters.getQueue() != null) { tonyConf.set( diff --git a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/pom.xml b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/pom.xml index 15dffb95e7..3609b5ac9c 100644 --- a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/pom.xml +++ b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/pom.xml @@ -22,7 +22,7 @@ org.apache.hadoop 0.2.0-SNAPSHOT - hadoop-submarine-score-yarnservice-runtime + hadoop-submarine-yarnservice-runtime 0.2.0-SNAPSHOT Hadoop Submarine YARN Service Runtime @@ -108,12 +108,10 @@ org.apache.hadoop hadoop-yarn-services-api - 3.3.0-SNAPSHOT org.apache.hadoop hadoop-yarn-services-core - 3.3.0-SNAPSHOT org.apache.hadoop diff --git a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/AbstractComponent.java b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/AbstractComponent.java index 40154223d1..0e0bc751b1 100644 --- a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/AbstractComponent.java +++ b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/main/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/AbstractComponent.java @@ -30,7 +30,6 @@ import java.io.IOException; import java.util.Objects; -import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE; import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments; import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName; import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact; @@ -85,8 +84,11 @@ protected Component createComponentInternal() throws IOException { if (role.equals(TensorFlowRole.PRIMARY_WORKER) || role.equals(PyTorchRole.PRIMARY_WORKER)) { component.setNumberOfContainers(1L); + // If the dependencies are upgraded to hadoop 3.3.0. + // yarn.service.container-state-report-as-service-state can be replaced + // with CONTAINER_STATE_REPORT_AS_SERVICE_STATE component.getConfiguration().setProperty( - CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"); + "yarn.service.container-state-report-as-service-state", "true"); } else { component.setNumberOfContainers( (long) parameters.getNumWorkers() - 1); diff --git a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/tensorflow/component/TestTensorFlowWorkerComponent.java b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/tensorflow/component/TestTensorFlowWorkerComponent.java index c17757f7fb..d75aff9057 100644 --- a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/tensorflow/component/TestTensorFlowWorkerComponent.java +++ b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/runtimes/yarnservice/tensorflow/component/TestTensorFlowWorkerComponent.java @@ -33,7 +33,6 @@ import java.util.Map; import static junit.framework.TestCase.assertTrue; -import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.mockito.ArgumentMatchers.any; @@ -209,8 +208,11 @@ public void testPrimaryWorkerComponentNumWorkersIsTwo() throws IOException { Component component = workerComponent.createComponent(); assertEquals(1L, (long) component.getNumberOfContainers()); + // If the dependencies are upgraded to hadoop 3.3.0. + // yarn.service.container-state-report-as-service-state can be replaced + // with CONTAINER_STATE_REPORT_AS_SERVICE_STATE verifyCommons(component, ImmutableMap.of( - CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true")); + "yarn.service.container-state-report-as-service-state", "true")); } } \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/utils/TestSubmarineResourceUtils.java b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/utils/TestSubmarineResourceUtils.java index f22fbaaa2b..1c0a73b8dc 100644 --- a/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/utils/TestSubmarineResourceUtils.java +++ b/hadoop-submarine/hadoop-submarine-yarnservice-runtime/src/test/java/org/apache/hadoop/yarn/submarine/utils/TestSubmarineResourceUtils.java @@ -19,12 +19,17 @@ import com.google.common.collect.ImmutableMap; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.LocalConfigurationProvider; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.service.api.records.ResourceInformation; -import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.junit.After; import org.junit.Test; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; import java.util.Map; import static org.junit.Assert.*; @@ -33,13 +38,48 @@ * This class is to test {@link SubmarineResourceUtils}. */ public class TestSubmarineResourceUtils { + /** + * With the dependencies of hadoop 3.2.0, Need to create a + * CustomResourceTypesConfigurationProvider implementations. If the + * dependencies are upgraded to hadoop 3.3.0. It can be replaced by + * org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvi- + * der + */ + private static class CustomResourceTypesConfigurationProvider + extends LocalConfigurationProvider { + + @Override + public InputStream getConfigurationInputStream(Configuration bootstrapConf, + String name) throws YarnException, IOException { + if (YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE.equals(name)) { + return new ByteArrayInputStream( + ("\n" + + " \n" + + " yarn.resource-types\n" + + " " + CUSTOM_RESOURCE_NAME + "\n" + + " \n" + + " \n" + + " yarn.resource-types.a-custom-resource.units\n" + + + " G\n" + + " \n" + + "\n").getBytes()); + } else { + return super.getConfigurationInputStream(bootstrapConf, name); + } + } + } + private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource"; private void initResourceTypes() { - CustomResourceTypesConfigurationProvider.initResourceTypes( - ImmutableMap.builder() - .put(CUSTOM_RESOURCE_NAME, "G") - .build()); + // If the dependencies are upgraded to hadoop 3.3.0. It can be replaced by + // org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationPro- + // vider + Configuration configuration = new Configuration(); + configuration.set(YarnConfiguration.RM_CONFIGURATION_PROVIDER_CLASS, + CustomResourceTypesConfigurationProvider.class.getName()); + ResourceUtils.resetResourceTypes(configuration); } @After diff --git a/hadoop-submarine/pom.xml b/hadoop-submarine/pom.xml index 1f44556905..951e9d3b12 100644 --- a/hadoop-submarine/pom.xml +++ b/hadoop-submarine/pom.xml @@ -20,7 +20,7 @@ org.apache.hadoop hadoop-project - 3.3.0-SNAPSHOT + 3.2.0 hadoop-submarine @@ -32,29 +32,126 @@ ${basedir}/../hadoop-common-project/hadoop-common/target - - hadoop-submarine-core - hadoop-submarine-yarnservice-runtime - hadoop-submarine-tony-runtime + hadoop-submarine-all + hadoop-submarine-dist - - - clover - - false - - clover - - + - com.cenqua.clover - clover + org.mockito + mockito-core + 2.23.4 + + + + org.apache.hadoop + hadoop-yarn-services-api + ${hadoop.version} + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + + org.apache.hadoop + hadoop-yarn-api + ${hadoop.version} + + + + org.apache.hadoop + hadoop-yarn-common + ${hadoop.version} + + + + org.apache.hadoop + hadoop-yarn-client + ${hadoop.version} + + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version} + + + + org.apache.commons + commons-lang3 + 3.7 - - + + + + + hadoop-3.2 + + 3.2.0 + + + hadoop-submarine-yarnservice-runtime + hadoop-submarine-tony-runtime + + + + + + hadoop-3.1 + + true + + + 3.1.2 + + + hadoop-submarine-yarnservice-runtime + hadoop-submarine-tony-runtime + + + + + hadoop-2.9 + + 2.9.2 + + + hadoop-submarine-tony-runtime + + + + + hadoop-2.7 + + 2.7.3 + + + hadoop-submarine-tony-runtime + + + + + clover + + false + + clover + + + + + com.cenqua.clover + clover + + + + +