SUBMARINE-58. Submarine client needs to generate fat jar. Contributed by Zac Zhou.

This commit is contained in:
Zhankun Tang 2019-05-19 21:18:33 +08:00
parent 732133cb2a
commit 729ccb2cab
42 changed files with 1424 additions and 183 deletions

View File

@ -0,0 +1,183 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<artifactId>hadoop-submarine</artifactId>
<groupId>org.apache.hadoop</groupId>
<version>0.2.0-SNAPSHOT</version>
</parent>
<artifactId>${project.artifactId}</artifactId>
<version>${project.version}</version>
<name>Hadoop Submarine All</name>
<properties>
<!-- Needed for generating FindBugs warnings using parent pom -->
<yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
<project.artifactId>hadoop-submarine-all</project.artifactId>
<project.version>0.2.0-SNAPSHOT</project.version>
</properties>
<dependencies>
<!-- Dependencies for Hadoop commons -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-core</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<profiles>
<profile>
<id>hadoop-3.2</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
</profile>
<!-- Default profile-->
<profile>
<id>hadoop-3.1</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop-2.9</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop-2.7</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
</profiles>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!--
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedClassifierName>with-all-dependencies</shadedClassifierName>
-->
<outputFile>target/${project.artifactId}-${project.version}-${project.activeProfiles[0].id}.jar</outputFile>
<artifactSet>
<excludes>
<exclude>classworlds:classworlds</exclude>
<exclude>junit:junit</exclude>
<exclude>jmock:*</exclude>
<exclude>*:xml-apis</exclude>
<exclude>org.apache.maven:lib:tests</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.hadoop.yarn.submarine.client.cli.Cli</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -67,6 +67,10 @@
<groupId>org.yaml</groupId> <groupId>org.yaml</groupId>
<artifactId>snakeyaml</artifactId> <artifactId>snakeyaml</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- Dependencies for Hadoop commons --> <!-- Dependencies for Hadoop commons -->

View File

@ -45,7 +45,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
import org.apache.hadoop.yarn.submarine.common.ClientContext; import org.apache.hadoop.yarn.submarine.common.ClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole; import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager; import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import org.yaml.snakeyaml.introspector.Property; import org.yaml.snakeyaml.introspector.Property;
import org.yaml.snakeyaml.introspector.PropertyUtils; import org.yaml.snakeyaml.introspector.PropertyUtils;
@ -271,8 +271,7 @@ public abstract class RunJobParameters extends RunParameters {
throw new ParseException( throw new ParseException(
"--" + CliConstants.WORKER_RES + " is absent."); "--" + CliConstants.WORKER_RES + " is absent.");
} }
return ResourceUtils.createResourceFromString(workerResourceStr, return ResourceUtils.createResourceFromString(workerResourceStr);
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
} }
return null; return null;
} }

View File

@ -27,7 +27,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.param.ParametersHolder;
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters; import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
import org.apache.hadoop.yarn.submarine.common.ClientContext; import org.apache.hadoop.yarn.submarine.common.ClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole; import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
@ -127,8 +127,7 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
if (psResourceStr == null) { if (psResourceStr == null) {
throw new ParseException("--" + CliConstants.PS_RES + " is absent."); throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
} }
return ResourceUtils.createResourceFromString(psResourceStr, return ResourceUtils.createResourceFromString(psResourceStr);
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
} }
return null; return null;
} }
@ -151,9 +150,8 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) { if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES; tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
} }
Resource tensorboardResource = Resource tensorboardResource = ResourceUtils.createResourceFromString(
ResourceUtils.createResourceFromString(tensorboardResourceStr, tensorboardResourceStr);
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
String tensorboardDockerImage = String tensorboardDockerImage =
parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE); parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE);
return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null, return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null,

View File

@ -0,0 +1,332 @@
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. See accompanying LICENSE file.
*/
package org.apache.hadoop.yarn.submarine.common.resource;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* This class implements some methods with the almost the same logic as
* org.apache.hadoop.yarn.util.resource.ResourceUtils of hadoop 3.3.
* If the hadoop dependencies are upgraded to 3.3, this class can be refactored
* with org.apache.hadoop.yarn.util.resource.ResourceUtils.
*/
public final class ResourceUtils {
private final static String RES_PATTERN = "^[^=]+=\\d+\\s?\\w*$";
private final static String SET_RESOURCE_VALUE_METHOD = "setResourceValue";
private final static String SET_MEMORY_SIZE_METHOD = "setMemorySize";
private final static String DEPRECATED_SET_MEMORY_SIZE_METHOD =
"setMemory";
private final static String GET_MEMORY_SIZE_METHOD = "getMemorySize";
private final static String DEPRECATED_GET_MEMORY_SIZE_METHOD =
"getMemory";
private final static String GET_RESOURCE_VALUE_METHOD = "getResourceValue";
private final static String GET_RESOURCE_TYPE_METHOD =
"getResourcesTypeInfo";
private final static String REINITIALIZE_RESOURCES_METHOD =
"reinitializeResources";
public static final String MEMORY_URI = "memory-mb";
public static final String VCORES_URI = "vcores";
public static final String GPU_URI = "yarn.io/gpu";
public static final String FPGA_URI = "yarn.io/fpga";
private static final Logger LOG =
LoggerFactory.getLogger(ResourceUtils.class);
private ResourceUtils() {}
public static Resource createResourceFromString(String resourceStr) {
Map<String, Long> typeToValue = parseResourcesString(resourceStr);
Resource resource = Resource.newInstance(0, 0);
for (Map.Entry<String, Long> entry : typeToValue.entrySet()) {
if(entry.getKey().equals(VCORES_URI)) {
resource.setVirtualCores(entry.getValue().intValue());
continue;
} else if (entry.getKey().equals(MEMORY_URI)) {
setMemorySize(resource, entry.getValue());
continue;
}
setResource(resource, entry.getKey(), entry.getValue().intValue());
}
return resource;
}
private static Map<String, Long> parseResourcesString(String resourcesStr) {
Map<String, Long> resources = new HashMap<>();
String[] pairs = resourcesStr.trim().split(",");
for (String resource : pairs) {
resource = resource.trim();
if (!resource.matches(RES_PATTERN)) {
throw new IllegalArgumentException("\"" + resource + "\" is not a "
+ "valid resource type/amount pair. "
+ "Please provide key=amount pairs separated by commas.");
}
String[] splits = resource.split("=");
String key = splits[0], value = splits[1];
String units = getUnits(value);
String valueWithoutUnit = value.substring(0,
value.length()- units.length()).trim();
long resourceValue = Long.parseLong(valueWithoutUnit);
// Convert commandline unit to standard YARN unit.
if (units.equals("M") || units.equals("m")) {
units = "Mi";
} else if (units.equals("G") || units.equals("g")) {
units = "Gi";
} else if (!units.isEmpty()){
throw new IllegalArgumentException("Acceptable units are M/G or empty");
}
// special handle memory-mb and memory
if (key.equals(MEMORY_URI)) {
if (!units.isEmpty()) {
resourceValue = UnitsConversionUtil.convert(units, "Mi",
resourceValue);
}
}
if (key.equals("memory")) {
key = MEMORY_URI;
resourceValue = UnitsConversionUtil.convert(units, "Mi",
resourceValue);
}
// special handle gpu
if (key.equals("gpu")) {
key = GPU_URI;
}
// special handle fpga
if (key.equals("fpga")) {
key = FPGA_URI;
}
resources.put(key, resourceValue);
}
return resources;
}
/**
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
* Use reflection to set GPU or other resources for compatibility with
* hadoop 2.9.2
*/
public static void setResource(Resource resource, String resourceName,
int resourceValue) {
try {
Method method = resource.getClass().getMethod(SET_RESOURCE_VALUE_METHOD,
String.class, long.class);
method.invoke(resource, resourceName, resourceValue);
} catch (NoSuchMethodException e) {
LOG.error("There is no '" + SET_RESOURCE_VALUE_METHOD + "' API in this" +
"version of YARN", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.error("Failed to invoke '" + SET_RESOURCE_VALUE_METHOD +
"' method to set GPU resources", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
return;
}
public static void setMemorySize(Resource resource, Long memorySize) {
boolean useWithIntParameter = false;
// For hadoop 2.9.2 and above
try {
Method method = resource.getClass().getMethod(SET_MEMORY_SIZE_METHOD,
long.class);
method.setAccessible(true);
method.invoke(resource, memorySize);
} catch (NoSuchMethodException nsme) {
LOG.info("There is no '" + SET_MEMORY_SIZE_METHOD + "(long)' API in" +
" this version of YARN");
useWithIntParameter = true;
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.error("Failed to invoke '" + SET_MEMORY_SIZE_METHOD +
"' method", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
// For hadoop 2.7.3
if (useWithIntParameter) {
try {
LOG.info("Trying to use '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
"(int)' API for this version of YARN");
Method method = resource.getClass().getMethod(
DEPRECATED_SET_MEMORY_SIZE_METHOD, int.class);
method.invoke(resource, memorySize.intValue());
} catch (NoSuchMethodException e) {
LOG.error("There is no '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
"(int)' API in this version of YARN", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.error("Failed to invoke '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
"' method", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
}
}
public static long getMemorySize(Resource resource) {
boolean useWithIntParameter = false;
long memory = 0;
// For hadoop 2.9.2 and above
try {
Method method = resource.getClass().getMethod(GET_MEMORY_SIZE_METHOD);
method.setAccessible(true);
memory = (long) method.invoke(resource);
} catch (NoSuchMethodException e) {
LOG.info("There is no '" + GET_MEMORY_SIZE_METHOD + "' API in" +
" this version of YARN");
useWithIntParameter = true;
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.error("Failed to invoke '" + GET_MEMORY_SIZE_METHOD +
"' method", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
// For hadoop 2.7.3
if (useWithIntParameter) {
try {
LOG.info("Trying to use '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
"' API for this version of YARN");
Method method = resource.getClass().getMethod(
DEPRECATED_GET_MEMORY_SIZE_METHOD);
method.setAccessible(true);
memory = ((Integer) method.invoke(resource)).longValue();
} catch (NoSuchMethodException e) {
LOG.error("There is no '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
"' API in this version of YARN", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.error("Failed to invoke '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
"' method", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
}
return memory;
}
/**
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
* Use reflection to set GPU or other resources for compatibility with
* hadoop 2.9.2
*/
public static long getResourceValue(Resource resource, String resourceName) {
long resourceValue = 0;
try {
Method method = resource.getClass().getMethod(GET_RESOURCE_VALUE_METHOD,
String.class);
Object value = method.invoke(resource, resourceName);
resourceValue = (long) value;
} catch (NoSuchMethodException e) {
LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" +
" version of YARN");
} catch (InvocationTargetException e) {
if (e.getTargetException().getClass().getName().equals(
"org.apache.hadoop.yarn.exceptions.ResourceNotFoundException")) {
LOG.info("Not found resource " + resourceName);
} else {
LOG.info("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD + "'" +
" method to get resource " + resourceName);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
} catch (IllegalAccessException | ClassCastException e) {
LOG.error("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD +
"' method to get resource " + resourceName, e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
return resourceValue;
}
/**
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
* Use reflection to add GPU or other resources for compatibility with
* hadoop 2.9.2
*/
public static void configureResourceType(String resrouceName) {
Class resourceTypeInfo;
try{
resourceTypeInfo = Class.forName(
"org.apache.hadoop.yarn.api.records.ResourceTypeInfo");
Class resourceUtils = Class.forName(
"org.apache.hadoop.yarn.util.resource.ResourceUtils");
Method method = resourceUtils.getMethod(GET_RESOURCE_TYPE_METHOD);
Object resTypes = method.invoke(null);
Method resourceTypeInstance = resourceTypeInfo.getMethod("newInstance",
String.class, String.class);
Object resourceType = resourceTypeInstance.invoke(null, resrouceName, "");
((ArrayList)resTypes).add(resourceType);
Method reInitialMethod = resourceUtils.getMethod(
REINITIALIZE_RESOURCES_METHOD, List.class);
reInitialMethod.invoke(null, resTypes);
} catch (ClassNotFoundException e) {
LOG.info("There is no specified class API in this" +
" version of YARN");
LOG.info(e.getMessage());
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
} catch (NoSuchMethodException nsme) {
LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" +
" version of YARN");
} catch (IllegalAccessException | InvocationTargetException e) {
LOG.info("Failed to invoke 'configureResourceType' method ", e);
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
}
}
private static String getUnits(String resourceValue) {
return parseResourceValue(resourceValue)[0];
}
/**
* Extract unit and actual value from resource value.
* @param resourceValue Value of the resource
* @return Array containing unit and value. [0]=unit, [1]=value
* @throws IllegalArgumentException if units contain non alpha characters
*/
private static String[] parseResourceValue(String resourceValue) {
String[] resource = new String[2];
int i = 0;
for (; i < resourceValue.length(); i++) {
if (Character.isAlphabetic(resourceValue.charAt(i))) {
break;
}
}
String units = resourceValue.substring(i);
if (StringUtils.isAlpha(units) || units.equals("")) {
resource[0] = units;
resource[1] = resourceValue.substring(0, i);
return resource;
} else {
throw new IllegalArgumentException("Units '" + units + "'"
+ " contains non alphabet characters, which is not allowed.");
}
}
}

View File

@ -0,0 +1,164 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.submarine.common.resource;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Almost the same logic as UnitsConversionUtil[YARN-4081]. If the dependencies
* are upgraded to hadoop 3.*, this class can be replaced.
*/
public final class UnitsConversionUtil {
private UnitsConversionUtil() {}
/**
* Helper class for encapsulating conversion values.
*/
public static class Converter {
private long numerator;
private long denominator;
Converter(long n, long d) {
this.numerator = n;
this.denominator = d;
}
}
private static final String[] UNITS = {"p", "n", "u", "m", "", "k", "M", "G",
"T", "P", "Ki", "Mi", "Gi", "Ti", "Pi"};
private static final List<String> SORTED_UNITS = Arrays.asList(UNITS);
public static final Set<String> KNOWN_UNITS = createKnownUnitsSet();
private static final Converter PICO =
new Converter(1L, 1000L * 1000L * 1000L * 1000L);
private static final Converter NANO =
new Converter(1L, 1000L * 1000L * 1000L);
private static final Converter MICRO = new Converter(1L, 1000L * 1000L);
private static final Converter MILLI = new Converter(1L, 1000L);
private static final Converter BASE = new Converter(1L, 1L);
private static final Converter KILO = new Converter(1000L, 1L);
private static final Converter MEGA = new Converter(1000L * 1000L, 1L);
private static final Converter GIGA =
new Converter(1000L * 1000L * 1000L, 1L);
private static final Converter TERA =
new Converter(1000L * 1000L * 1000L * 1000L, 1L);
private static final Converter PETA =
new Converter(1000L * 1000L * 1000L * 1000L * 1000L, 1L);
private static final Converter KILO_BINARY = new Converter(1024L, 1L);
private static final Converter MEGA_BINARY = new Converter(1024L * 1024L, 1L);
private static final Converter GIGA_BINARY =
new Converter(1024L * 1024L * 1024L, 1L);
private static final Converter TERA_BINARY =
new Converter(1024L * 1024L * 1024L * 1024L, 1L);
private static final Converter PETA_BINARY =
new Converter(1024L * 1024L * 1024L * 1024L * 1024L, 1L);
private static Set<String> createKnownUnitsSet() {
Set<String> ret = new HashSet<>();
ret.addAll(Arrays.asList(UNITS));
return ret;
}
private static Converter getConverter(String unit) {
switch (unit) {
case "p":
return PICO;
case "n":
return NANO;
case "u":
return MICRO;
case "m":
return MILLI;
case "":
return BASE;
case "k":
return KILO;
case "M":
return MEGA;
case "G":
return GIGA;
case "T":
return TERA;
case "P":
return PETA;
case "Ki":
return KILO_BINARY;
case "Mi":
return MEGA_BINARY;
case "Gi":
return GIGA_BINARY;
case "Ti":
return TERA_BINARY;
case "Pi":
return PETA_BINARY;
default:
throw new IllegalArgumentException(
"Unknown unit '" + unit + "'. Known units are " + KNOWN_UNITS);
}
}
/**
* Converts a value from one unit to another. Supported units can be obtained
* by inspecting the KNOWN_UNITS set.
*
* @param fromUnit the unit of the from value
* @param toUnit the target unit
* @param fromValue the value you wish to convert
* @return the value in toUnit
*/
public static long convert(String fromUnit, String toUnit, long fromValue) {
if (toUnit == null || fromUnit == null) {
throw new IllegalArgumentException("One or more arguments are null");
}
if (fromUnit.equals(toUnit)) {
return fromValue;
}
Converter fc = getConverter(fromUnit);
Converter tc = getConverter(toUnit);
long numerator = fc.numerator * tc.denominator;
long denominator = fc.denominator * tc.numerator;
long numeratorMultiplierLimit = Long.MAX_VALUE / numerator;
if (numerator < denominator) {
if (numeratorMultiplierLimit < fromValue) {
String overflowMsg =
"Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
+ "' will result in an overflow of Long";
throw new IllegalArgumentException(overflowMsg);
}
return (fromValue * numerator) / denominator;
}
if (numeratorMultiplierLimit > fromValue) {
return (numerator * fromValue) / denominator;
}
long tmp = numerator / denominator;
if ((Long.MAX_VALUE / tmp) < fromValue) {
String overflowMsg =
"Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
+ "' will result in an overflow of Long";
throw new IllegalArgumentException(overflowMsg);
}
return fromValue * tmp;
}
}

View File

@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This package contains resource utility classes.
*/
package org.apache.hadoop.yarn.submarine.common.resource;

View File

@ -164,6 +164,22 @@ See below screenshot:
![alt text](./images/tensorboard-service.png "Tensorboard service") ![alt text](./images/tensorboard-service.png "Tensorboard service")
If there is no hadoop client, we can also use the java command and the uber jar, hadoop-submarine-all-*.jar, to submit the job.
```
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \
--docker_image <your-docker-image> \
--input_path hdfs://default/dataset/cifar-10-data \
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
--worker_resources memory=4G,vcores=2,gpu=2 \
--worker_launch_cmd "python ... (Your training application cmd)" \
--tensorboard # this will launch a companion tensorboard container for monitoring
```
### Launch Distributed Tensorflow Application: ### Launch Distributed Tensorflow Application:
#### Commandline #### Commandline
@ -181,6 +197,20 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
--num_ps 2 \ --num_ps 2 \
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \ --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
``` ```
Or
```
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
--name tf-job-001 --docker_image <your docker image> \
--input_path hdfs://default/dataset/cifar-10-data \
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--num_workers 2 \
--worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \
--num_ps 2 \
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
```
#### Notes: #### Notes:
@ -197,7 +227,11 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
``` ```
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001 yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
``` ```
Or
```
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
org.apache.hadoop.yarn.submarine.client.cli.Cli job show --name tf-job-001
```
Output looks like: Output looks like:
``` ```
Job Meta Info: Job Meta Info:
@ -222,6 +256,17 @@ yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \ --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
--num_workers 0 --tensorboard --num_workers 0 --tensorboard
``` ```
Or
```
# Cleanup previous service if needed
yarn app -destroy tensorboard-service; \
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
--name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--num_workers 0 --tensorboard
```
You can view multiple job training history like from the `Tensorboard` link: You can view multiple job training history like from the `Tensorboard` link:
@ -243,3 +288,17 @@ If you want to build the Submarine project by yourself, you can follow the steps
- Run 'mvn install -DskipTests' from Hadoop source top level once. - Run 'mvn install -DskipTests' from Hadoop source top level once.
- Navigate to hadoop-submarine folder and run 'mvn clean package'. - Navigate to hadoop-submarine folder and run 'mvn clean package'.
- By Default, hadoop-submarine is built based on hadoop 3.1.2 dependencies.
Both yarn service runtime and tony runtime are built.
You can also add a parameter of "-Phadoop-3.2" to specify the dependencies
to hadoop 3.2.0.
- Hadoop-submarine can support hadoop 2.9.2 and hadoop 2.7.4 as well.
You can add "-Phadoop-2.9" to build submarine based on hadoop 2.9.2.
For example:
```
mvn clean package -Phadoop-2.9
```
As yarn service is based on hadoop 3.*, so only tony runtime is built
in this case.

View File

@ -16,15 +16,14 @@
package org.apache.hadoop.yarn.submarine.client.cli.runjob; package org.apache.hadoop.yarn.submarine.client.cli.runjob;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException; import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
@ -32,10 +31,10 @@ import org.junit.BeforeClass;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.ExpectedException; import org.junit.rules.ExpectedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.List;
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext; import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
@ -49,6 +48,8 @@ public class TestRunJobCliParsingCommonYaml {
private static final String DIR_NAME = "runjob-common-yaml"; private static final String DIR_NAME = "runjob-common-yaml";
private static final String TF_DIR = "runjob-pytorch-yaml"; private static final String TF_DIR = "runjob-pytorch-yaml";
private File yamlConfig; private File yamlConfig;
private static Logger LOG = LoggerFactory.getLogger(
TestRunJobCliParsingCommonYaml.class);
@Before @Before
public void before() { public void before() {
@ -62,10 +63,12 @@ public class TestRunJobCliParsingCommonYaml {
@BeforeClass @BeforeClass
public static void configureResourceTypes() { public static void configureResourceTypes() {
List<ResourceTypeInfo> resTypes = new ArrayList<>( try {
ResourceUtils.getResourcesTypeInfo()); ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, "")); } catch (SubmarineRuntimeException e) {
ResourceUtils.reinitializeResources(resTypes); LOG.info("The hadoop dependency doesn't support gpu resource, " +
"so just skip this test case.");
}
} }
@Rule @Rule

View File

@ -24,29 +24,28 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException; import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli; import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.ExpectedException; import org.junit.rules.ExpectedException;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** /**
* Test class that verifies the correctness of PyTorch * Test class that verifies the correctness of PyTorch
@ -56,6 +55,8 @@ public class TestRunJobCliParsingPyTorchYaml {
private static final String OVERRIDDEN_PREFIX = "overridden_"; private static final String OVERRIDDEN_PREFIX = "overridden_";
private static final String DIR_NAME = "runjob-pytorch-yaml"; private static final String DIR_NAME = "runjob-pytorch-yaml";
private File yamlConfig; private File yamlConfig;
private static Logger LOG = LoggerFactory.getLogger(
TestRunJobCliParsingPyTorchYaml.class);
@Before @Before
public void before() { public void before() {
@ -67,14 +68,6 @@ public class TestRunJobCliParsingPyTorchYaml {
YamlConfigTestUtils.deleteFile(yamlConfig); YamlConfigTestUtils.deleteFile(yamlConfig);
} }
@BeforeClass
public static void configureResourceTypes() {
List<ResourceTypeInfo> resTypes = new ArrayList<>(
ResourceUtils.getResourcesTypeInfo());
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
ResourceUtils.reinitializeResources(resTypes);
}
@Rule @Rule
public ExpectedException exception = ExpectedException.none(); public ExpectedException exception = ExpectedException.none();
@ -105,23 +98,38 @@ public class TestRunJobCliParsingPyTorchYaml {
} }
} }
private void verifyWorkerValues(RunJobParameters jobRunParameters, private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters
String prefix) { jobRunParameters, String prefix) {
assertTrue(RunJobParameters.class + " must be an instance of " + assertTrue(RunJobParameters.class + " must be an instance of " +
PyTorchRunJobParameters.class, PyTorchRunJobParameters.class,
jobRunParameters instanceof PyTorchRunJobParameters); jobRunParameters instanceof PyTorchRunJobParameters);
PyTorchRunJobParameters tensorFlowParams = PyTorchRunJobParameters pyTorchParams =
(PyTorchRunJobParameters) jobRunParameters; (PyTorchRunJobParameters) jobRunParameters;
assertEquals(3, tensorFlowParams.getNumWorkers()); assertEquals(3, pyTorchParams.getNumWorkers());
assertEquals(prefix + "testLaunchCmdWorker", assertEquals(prefix + "testLaunchCmdWorker",
tensorFlowParams.getWorkerLaunchCmd()); pyTorchParams.getWorkerLaunchCmd());
assertEquals(prefix + "testDockerImageWorker", assertEquals(prefix + "testDockerImageWorker",
tensorFlowParams.getWorkerDockerImage()); pyTorchParams.getWorkerDockerImage());
assertEquals(ResourceTypesTestHelper.newResource(20480L, 32, return pyTorchParams;
ImmutableMap.<String, String> builder() }
.put(ResourceInformation.GPU_URI, "2").build()),
tensorFlowParams.getWorkerResource()); private void verifyWorkerValues(RunJobParameters jobRunParameters,
String prefix) {
PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
(jobRunParameters, prefix);
assertEquals(Resources.createResource(20480, 32),
pyTorchParams.getWorkerResource());
}
private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
String prefix) {
PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
(jobRunParameters, prefix);
Resource workResource = Resources.createResource(20480, 32);
ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
assertEquals(workResource, pyTorchParams.getWorkerResource());
} }
private void verifySecurityValues(RunJobParameters jobRunParameters) { private void verifySecurityValues(RunJobParameters jobRunParameters) {
@ -146,6 +154,30 @@ public class TestRunJobCliParsingPyTorchYaml {
verifySecurityValues(jobRunParameters); verifySecurityValues(jobRunParameters);
} }
@Test
public void testValidGpuYamlParsing() throws Exception {
try {
ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
} catch (SubmarineRuntimeException e) {
LOG.info("The hadoop dependency doesn't support gpu resource, " +
"so just skip this test case.");
return;
}
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
Assert.assertFalse(SubmarineLogs.isVerbose());
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/valid-gpu-config.yaml");
runJobCli.run(
new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
verifyWorkerValuesWithGpu(jobRunParameters, "");
verifySecurityValues(jobRunParameters);
}
@Test @Test
public void testRoleOverrides() throws Exception { public void testRoleOverrides() throws Exception {
RunJobCli runJobCli = new RunJobCli(getMockClientContext()); RunJobCli runJobCli = new RunJobCli(getMockClientContext());

View File

@ -18,26 +18,25 @@
package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow; package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils; import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli; import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs; import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.ExpectedException; import org.junit.rules.ExpectedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext; import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
@ -53,6 +52,8 @@ public class TestRunJobCliParsingTensorFlowYaml {
private static final String OVERRIDDEN_PREFIX = "overridden_"; private static final String OVERRIDDEN_PREFIX = "overridden_";
private static final String DIR_NAME = "runjob-tensorflow-yaml"; private static final String DIR_NAME = "runjob-tensorflow-yaml";
private File yamlConfig; private File yamlConfig;
private static Logger LOG = LoggerFactory.getLogger(
TestRunJobCliParsingTensorFlowYaml.class);
@Before @Before
public void before() { public void before() {
@ -64,14 +65,6 @@ public class TestRunJobCliParsingTensorFlowYaml {
YamlConfigTestUtils.deleteFile(yamlConfig); YamlConfigTestUtils.deleteFile(yamlConfig);
} }
@BeforeClass
public static void configureResourceTypes() {
List<ResourceTypeInfo> resTypes = new ArrayList<>(
ResourceUtils.getResourcesTypeInfo());
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
ResourceUtils.reinitializeResources(resTypes);
}
@Rule @Rule
public ExpectedException exception = ExpectedException.none(); public ExpectedException exception = ExpectedException.none();
@ -114,14 +107,12 @@ public class TestRunJobCliParsingTensorFlowYaml {
assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd()); assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd());
assertEquals(prefix + "testDockerImagePs", assertEquals(prefix + "testDockerImagePs",
tensorFlowParams.getPsDockerImage()); tensorFlowParams.getPsDockerImage());
assertEquals(ResourceTypesTestHelper.newResource(20500L, 34, assertEquals(Resources.createResource(20500, 34),
ImmutableMap.<String, String> builder()
.put(ResourceInformation.GPU_URI, "4").build()),
tensorFlowParams.getPsResource()); tensorFlowParams.getPsResource());
} }
private void verifyWorkerValues(RunJobParameters jobRunParameters, private TensorFlowRunJobParameters verifyWorkerCommonValues(
String prefix) { RunJobParameters jobRunParameters, String prefix) {
assertTrue(RunJobParameters.class + " must be an instance of " + assertTrue(RunJobParameters.class + " must be an instance of " +
TensorFlowRunJobParameters.class, TensorFlowRunJobParameters.class,
jobRunParameters instanceof TensorFlowRunJobParameters); jobRunParameters instanceof TensorFlowRunJobParameters);
@ -133,12 +124,26 @@ public class TestRunJobCliParsingTensorFlowYaml {
tensorFlowParams.getWorkerLaunchCmd()); tensorFlowParams.getWorkerLaunchCmd());
assertEquals(prefix + "testDockerImageWorker", assertEquals(prefix + "testDockerImageWorker",
tensorFlowParams.getWorkerDockerImage()); tensorFlowParams.getWorkerDockerImage());
assertEquals(ResourceTypesTestHelper.newResource(20480L, 32, return tensorFlowParams;
ImmutableMap.<String, String> builder() }
.put(ResourceInformation.GPU_URI, "2").build()),
private void verifyWorkerValues(RunJobParameters jobRunParameters,
String prefix) {
TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
(jobRunParameters, prefix);
assertEquals(Resources.createResource(20480, 32),
tensorFlowParams.getWorkerResource()); tensorFlowParams.getWorkerResource());
} }
private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
String prefix) {
TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
(jobRunParameters, prefix);
Resource workResource = Resources.createResource(20480, 32);
ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
assertEquals(workResource, tensorFlowParams.getWorkerResource());
}
private void verifySecurityValues(RunJobParameters jobRunParameters) { private void verifySecurityValues(RunJobParameters jobRunParameters) {
assertEquals("keytabPath", jobRunParameters.getKeytab()); assertEquals("keytabPath", jobRunParameters.getKeytab());
assertEquals("testPrincipal", jobRunParameters.getPrincipal()); assertEquals("testPrincipal", jobRunParameters.getPrincipal());
@ -155,9 +160,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
assertTrue(tensorFlowParams.isTensorboardEnabled()); assertTrue(tensorFlowParams.isTensorboardEnabled());
assertEquals("tensorboardDockerImage", assertEquals("tensorboardDockerImage",
tensorFlowParams.getTensorboardDockerImage()); tensorFlowParams.getTensorboardDockerImage());
assertEquals(ResourceTypesTestHelper.newResource(21000L, 37, assertEquals(Resources.createResource(21000, 37),
ImmutableMap.<String, String> builder()
.put(ResourceInformation.GPU_URI, "3").build()),
tensorFlowParams.getTensorboardResource()); tensorFlowParams.getTensorboardResource());
} }
@ -179,6 +182,32 @@ public class TestRunJobCliParsingTensorFlowYaml {
verifyTensorboardValues(jobRunParameters); verifyTensorboardValues(jobRunParameters);
} }
@Test
public void testValidGpuYamlParsing() throws Exception {
try {
ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
} catch (SubmarineRuntimeException e) {
LOG.info("The hadoop dependency doesn't support gpu resource, " +
"so just skip this test case.");
return;
}
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
Assert.assertFalse(SubmarineLogs.isVerbose());
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/valid-gpu-config.yaml");
runJobCli.run(
new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
verifyPsValues(jobRunParameters, "");
verifyWorkerValuesWithGpu(jobRunParameters, "");
verifySecurityValues(jobRunParameters);
verifyTensorboardValues(jobRunParameters);
}
@Test @Test
public void testRoleOverrides() throws Exception { public void testRoleOverrides() throws Exception {
RunJobCli runJobCli = new RunJobCli(getMockClientContext()); RunJobCli runJobCli = new RunJobCli(getMockClientContext());
@ -240,9 +269,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
assertTrue(tensorFlowParams.isTensorboardEnabled()); assertTrue(tensorFlowParams.isTensorboardEnabled());
assertNull("tensorboardDockerImage should be null!", assertNull("tensorboardDockerImage should be null!",
tensorFlowParams.getTensorboardDockerImage()); tensorFlowParams.getTensorboardDockerImage());
assertEquals(ResourceTypesTestHelper.newResource(21000L, 37, assertEquals(Resources.createResource(21000, 37),
ImmutableMap.<String, String> builder()
.put(ResourceInformation.GPU_URI, "3").build()),
tensorFlowParams.getTensorboardResource()); tensorFlowParams.getTensorboardResource());
} }

View File

@ -132,14 +132,14 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
private void assertWorkerValues(Role worker) { private void assertWorkerValues(Role worker) {
assertEquals("testLaunchCmdWorker", worker.getLaunchCmd()); assertEquals("testLaunchCmdWorker", worker.getLaunchCmd());
assertEquals("testDockerImageWorker", worker.getDockerImage()); assertEquals("testDockerImageWorker", worker.getDockerImage());
assertEquals("memory=20480M,vcores=32,gpu=2", worker.getResources()); assertEquals("memory=20480M,vcores=32", worker.getResources());
assertEquals(3, worker.getReplicas()); assertEquals(3, worker.getReplicas());
} }
private void assertPsValues(Role ps) { private void assertPsValues(Role ps) {
assertEquals("testLaunchCmdPs", ps.getLaunchCmd()); assertEquals("testLaunchCmdPs", ps.getLaunchCmd());
assertEquals("testDockerImagePs", ps.getDockerImage()); assertEquals("testDockerImagePs", ps.getDockerImage());
assertEquals("memory=20500M,vcores=34,gpu=4", ps.getResources()); assertEquals("memory=20500M,vcores=34", ps.getResources());
assertEquals(4, ps.getReplicas()); assertEquals(4, ps.getReplicas());
} }
@ -161,7 +161,7 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
TensorBoard tensorBoard = yamlConfigFile.getTensorBoard(); TensorBoard tensorBoard = yamlConfigFile.getTensorBoard();
assertNotNull("Tensorboard should not be null!", tensorBoard); assertNotNull("Tensorboard should not be null!", tensorBoard);
assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage()); assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage());
assertEquals("memory=21000M,vcores=37,gpu=3", tensorBoard.getResources()); assertEquals("memory=21000M,vcores=37", tensorBoard.getResources());
} }
@Before @Before

View File

@ -20,15 +20,6 @@ package org.apache.hadoop.yarn.submarine.common;
import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager; import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager; import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import java.io.IOException;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class MockClientContext extends ClientContext { public class MockClientContext extends ClientContext {
@ -44,18 +35,4 @@ public class MockClientContext extends ClientContext {
RemoteDirectoryManager remoteDirectoryMgr) { RemoteDirectoryManager remoteDirectoryMgr) {
this.remoteDirectoryMgr = remoteDirectoryMgr; this.remoteDirectoryMgr = remoteDirectoryMgr;
} }
@Override
public synchronized YarnClient getOrCreateYarnClient() {
YarnClient client = mock(YarnClient.class);
try {
when(client.getResourceTypeInfo()).thenReturn(
ResourceUtils.getResourcesTypeInfo());
} catch (YarnException e) {
fail(e.getMessage());
} catch (IOException e) {
fail(e.getMessage());
}
return client;
}
} }

View File

@ -43,12 +43,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -59,5 +59,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -23,11 +23,11 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
@ -37,5 +37,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -42,12 +42,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -58,5 +58,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -40,10 +40,10 @@ configs:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs

View File

@ -43,11 +43,11 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs

View File

@ -42,11 +42,11 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
@ -56,5 +56,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -42,11 +42,11 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
@ -56,5 +56,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -40,7 +40,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: overridden_testLaunchCmdWorker launch_cmd: overridden_testLaunchCmdWorker
docker_image: overridden_testDockerImageWorker docker_image: overridden_testDockerImageWorker

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker

View File

@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spec:
name: testJobName
job_type: testJobType
framework: pytorch
configs:
input_path: testInputPath
checkpoint_path: testCheckpointPath
saved_model_path: testSavedModelPath
docker_image: testDockerImage
wait_job_finish: true
envs:
env1: 'env1Value'
env2: 'env2Value'
localizations:
- hdfs://remote-file1:/local-filename1:rw
- nfs://remote-file2:/local-filename2:rw
mounts:
- /etc/passwd:/etc/passwd:rw
- /etc/hosts:/etc/hosts:rw
quicklinks:
- Notebook_UI=https://master-0:7070
- Notebook_UI2=https://master-0:7071
scheduling:
queue: queue1
roles:
worker:
resources: memory=20480M,vcores=32,gpu=2
replicas: 3
launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker
security:
keytab: keytabPath
principal: testPrincipal
distribute_keytab: true

View File

@ -40,12 +40,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -56,5 +56,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -43,12 +43,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -58,5 +58,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -43,12 +43,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -59,4 +59,4 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37

View File

@ -43,7 +43,7 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: overridden_testLaunchCmdWorker launch_cmd: overridden_testLaunchCmdWorker
docker_image: overridden_testDockerImageWorker docker_image: overridden_testDockerImageWorker
@ -58,7 +58,7 @@ roles:
- /etc/hosts:/overridden_Worker - /etc/hosts:/overridden_Worker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: overridden_testLaunchCmdPs launch_cmd: overridden_testLaunchCmdPs
docker_image: overridden_testDockerImagePs docker_image: overridden_testDockerImagePs
@ -78,5 +78,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -43,12 +43,12 @@ scheduling:
roles: roles:
worker: worker:
resources: memory=20480M,vcores=32,gpu=2 resources: memory=20480M,vcores=32
replicas: 3 replicas: 3
launch_cmd: testLaunchCmdWorker launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker docker_image: testDockerImageWorker
ps: ps:
resources: memory=20500M,vcores=34,gpu=4 resources: memory=20500M,vcores=34
replicas: 4 replicas: 4
launch_cmd: testLaunchCmdPs launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs docker_image: testDockerImagePs
@ -59,5 +59,5 @@ security:
distribute_keytab: true distribute_keytab: true
tensorBoard: tensorBoard:
resources: memory=21000M,vcores=37,gpu=3 resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage docker_image: tensorboardDockerImage

View File

@ -0,0 +1,63 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
spec:
name: testJobName
job_type: testJobType
framework: tensorflow
configs:
input_path: testInputPath
checkpoint_path: testCheckpointPath
saved_model_path: testSavedModelPath
docker_image: testDockerImage
wait_job_finish: true
envs:
env1: 'env1Value'
env2: 'env2Value'
localizations:
- hdfs://remote-file1:/local-filename1:rw
- nfs://remote-file2:/local-filename2:rw
mounts:
- /etc/passwd:/etc/passwd:rw
- /etc/hosts:/etc/hosts:rw
quicklinks:
- Notebook_UI=https://master-0:7070
- Notebook_UI2=https://master-0:7071
scheduling:
queue: queue1
roles:
worker:
resources: memory=20480M,vcores=32,gpu=2
replicas: 3
launch_cmd: testLaunchCmdWorker
docker_image: testDockerImageWorker
ps:
resources: memory=20500M,vcores=34
replicas: 4
launch_cmd: testLaunchCmdPs
docker_image: testDockerImagePs
security:
keytab: keytabPath
principal: testPrincipal
distribute_keytab: true
tensorBoard:
resources: memory=21000M,vcores=37
docker_image: tensorboardDockerImage

View File

@ -0,0 +1,131 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<artifactId>hadoop-submarine</artifactId>
<groupId>org.apache.hadoop</groupId>
<version>0.2.0-SNAPSHOT</version>
</parent>
<artifactId>${project.artifactId}</artifactId>
<version>${project.version}</version>
<name>Hadoop Submarine Dist</name>
<packaging>pom</packaging>
<properties>
<!-- Needed for generating FindBugs warnings using parent pom -->
<yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
<project.artifactId>hadoop-submarine-dist</project.artifactId>
<project.version>0.2.0-SNAPSHOT</project.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-core</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<profiles>
<profile>
<id>hadoop-3.2</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<!-- Default profile-->
<profile>
<id>hadoop-3.1</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop-2.9</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop-2.7</id>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-submarine-tony-runtime</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
</profiles>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>dist</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<finalName>${project.artifactId}-${project.version}-${project.activeProfiles[0].id}</finalName>
<appendAssemblyId>false</appendAssemblyId>
<attach>false</attach>
<descriptors>
<descriptor>src/assembly/distribution.xml</descriptor>
</descriptors>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,61 @@
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
<id>distribution</id>
<formats>
<format>dir</format>
<format>tar.gz</format>
</formats>
<dependencySets>
<dependencySet>
<outputDirectory>/</outputDirectory>
<includes>
<include>org.apache.hadoop:hadoop-submarine-core</include>
</includes>
</dependencySet>
<dependencySet>
<outputDirectory>/</outputDirectory>
<includes>
<include>org.apache.hadoop:hadoop-submarine-tony-runtime</include>
</includes>
</dependencySet>
<dependencySet>
<outputDirectory>/</outputDirectory>
<includes>
<include>org.apache.hadoop:hadoop-submarine-yarnservice-runtime</include>
</includes>
</dependencySet>
</dependencySets>
<fileSets>
<fileSet>
<directory>../../</directory>
<includes>
<include>LICENSE*</include>
<include>NOTICE*</include>
</includes>
</fileSet>
<fileSet>
<directory>../hadoop-submarine-all/target/</directory>
<outputDirectory>/</outputDirectory>
<includes>
<include>hadoop-submarine-all-${project.version}-*.jar</include>
</includes>
</fileSet>
</fileSets>
</assembly>

View File

@ -23,6 +23,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>hadoop-submarine-tony-runtime</artifactId> <artifactId>hadoop-submarine-tony-runtime</artifactId>
<name>Hadoop Submarine Tony Runtime</name>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>

View File

@ -19,9 +19,8 @@ import com.linkedin.tony.util.Utils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters; import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -52,7 +51,7 @@ public final class TonyUtils {
tonyConf.setLong( tonyConf.setLong(
TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME, TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
Constants.MEMORY), Constants.MEMORY),
parameters.getPsResource().getMemorySize()); ResourceUtils.getMemorySize(parameters.getPsResource()));
} }
if (parameters.getWorkerResource() != null) { if (parameters.getWorkerResource() != null) {
tonyConf.setInt( tonyConf.setInt(
@ -62,16 +61,12 @@ public final class TonyUtils {
tonyConf.setLong( tonyConf.setLong(
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
Constants.MEMORY), Constants.MEMORY),
parameters.getWorkerResource().getMemorySize()); ResourceUtils.getMemorySize(parameters.getWorkerResource()));
try {
tonyConf.setLong( tonyConf.setLong(
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
Constants.GPUS), Constants.GPUS),
parameters.getWorkerResource() ResourceUtils.getResourceValue(parameters.getWorkerResource(),
.getResourceValue(ResourceInformation.GPU_URI)); ResourceUtils.GPU_URI));
} catch (ResourceNotFoundException rnfe) {
LOG.error("GPU resources not enabled.");
}
} }
if (parameters.getQueue() != null) { if (parameters.getQueue() != null) {
tonyConf.set( tonyConf.set(

View File

@ -22,7 +22,7 @@
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<version>0.2.0-SNAPSHOT</version> <version>0.2.0-SNAPSHOT</version>
</parent> </parent>
<artifactId>hadoop-submarine-score-yarnservice-runtime</artifactId> <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
<version>0.2.0-SNAPSHOT</version> <version>0.2.0-SNAPSHOT</version>
<name>Hadoop Submarine YARN Service Runtime</name> <name>Hadoop Submarine YARN Service Runtime</name>
@ -108,12 +108,10 @@
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-services-api</artifactId> <artifactId>hadoop-yarn-services-api</artifactId>
<version>3.3.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-services-core</artifactId> <artifactId>hadoop-yarn-services-core</artifactId>
<version>3.3.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>

View File

@ -30,7 +30,6 @@ import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchComma
import java.io.IOException; import java.io.IOException;
import java.util.Objects; import java.util.Objects;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments; import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName; import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact; import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
@ -85,8 +84,11 @@ public abstract class AbstractComponent {
if (role.equals(TensorFlowRole.PRIMARY_WORKER) || if (role.equals(TensorFlowRole.PRIMARY_WORKER) ||
role.equals(PyTorchRole.PRIMARY_WORKER)) { role.equals(PyTorchRole.PRIMARY_WORKER)) {
component.setNumberOfContainers(1L); component.setNumberOfContainers(1L);
// If the dependencies are upgraded to hadoop 3.3.0.
// yarn.service.container-state-report-as-service-state can be replaced
// with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
component.getConfiguration().setProperty( component.getConfiguration().setProperty(
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"); "yarn.service.container-state-report-as-service-state", "true");
} else { } else {
component.setNumberOfContainers( component.setNumberOfContainers(
(long) parameters.getNumWorkers() - 1); (long) parameters.getNumWorkers() - 1);

View File

@ -33,7 +33,6 @@ import java.io.IOException;
import java.util.Map; import java.util.Map;
import static junit.framework.TestCase.assertTrue; import static junit.framework.TestCase.assertTrue;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.any;
@ -209,8 +208,11 @@ public class TestTensorFlowWorkerComponent {
Component component = workerComponent.createComponent(); Component component = workerComponent.createComponent();
assertEquals(1L, (long) component.getNumberOfContainers()); assertEquals(1L, (long) component.getNumberOfContainers());
// If the dependencies are upgraded to hadoop 3.3.0.
// yarn.service.container-state-report-as-service-state can be replaced
// with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
verifyCommons(component, ImmutableMap.of( verifyCommons(component, ImmutableMap.of(
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true")); "yarn.service.container-state-report-as-service-state", "true"));
} }
} }

View File

@ -19,12 +19,17 @@ package org.apache.hadoop.yarn.submarine.utils;
import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.LocalConfigurationProvider;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.ResourceInformation; import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.junit.After; import org.junit.After;
import org.junit.Test; import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map; import java.util.Map;
import static org.junit.Assert.*; import static org.junit.Assert.*;
@ -33,13 +38,48 @@ import static org.junit.Assert.*;
* This class is to test {@link SubmarineResourceUtils}. * This class is to test {@link SubmarineResourceUtils}.
*/ */
public class TestSubmarineResourceUtils { public class TestSubmarineResourceUtils {
/**
* With the dependencies of hadoop 3.2.0, Need to create a
* CustomResourceTypesConfigurationProvider implementations. If the
* dependencies are upgraded to hadoop 3.3.0. It can be replaced by
* org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvi-
* der
*/
private static class CustomResourceTypesConfigurationProvider
extends LocalConfigurationProvider {
@Override
public InputStream getConfigurationInputStream(Configuration bootstrapConf,
String name) throws YarnException, IOException {
if (YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE.equals(name)) {
return new ByteArrayInputStream(
("<configuration>\n" +
" <property>\n" +
" <name>yarn.resource-types</name>\n" +
" <value>" + CUSTOM_RESOURCE_NAME + "</value>\n" +
" </property>\n" +
" <property>\n" +
" <name>yarn.resource-types.a-custom-resource.units</name>\n"
+
" <value>G</value>\n" +
" </property>\n" +
"</configuration>\n").getBytes());
} else {
return super.getConfigurationInputStream(bootstrapConf, name);
}
}
}
private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource"; private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";
private void initResourceTypes() { private void initResourceTypes() {
CustomResourceTypesConfigurationProvider.initResourceTypes( // If the dependencies are upgraded to hadoop 3.3.0. It can be replaced by
ImmutableMap.<String, String>builder() // org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationPro-
.put(CUSTOM_RESOURCE_NAME, "G") // vider
.build()); Configuration configuration = new Configuration();
configuration.set(YarnConfiguration.RM_CONFIGURATION_PROVIDER_CLASS,
CustomResourceTypesConfigurationProvider.class.getName());
ResourceUtils.resetResourceTypes(configuration);
} }
@After @After

View File

@ -20,7 +20,7 @@
<parent> <parent>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-project</artifactId> <artifactId>hadoop-project</artifactId>
<version>3.3.0-SNAPSHOT</version> <version>3.2.0</version>
<relativePath/> <relativePath/>
</parent> </parent>
<artifactId>hadoop-submarine</artifactId> <artifactId>hadoop-submarine</artifactId>
@ -32,15 +32,111 @@
<hadoop.common.build.dir>${basedir}/../hadoop-common-project/hadoop-common/target</hadoop.common.build.dir> <hadoop.common.build.dir>${basedir}/../hadoop-common-project/hadoop-common/target</hadoop.common.build.dir>
</properties> </properties>
<!-- Do not add dependencies here, add them to the POM of the leaf module -->
<modules> <modules>
<module>hadoop-submarine-core</module> <module>hadoop-submarine-core</module>
<module>hadoop-submarine-all</module>
<module>hadoop-submarine-dist</module>
</modules>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>2.23.4</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-services-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
</dependencies>
</dependencyManagement>
<profiles>
<profile>
<id>hadoop-3.2</id>
<properties>
<hadoop.version>3.2.0</hadoop.version>
</properties>
<modules>
<module>hadoop-submarine-yarnservice-runtime</module> <module>hadoop-submarine-yarnservice-runtime</module>
<module>hadoop-submarine-tony-runtime</module> <module>hadoop-submarine-tony-runtime</module>
</modules> </modules>
</profile>
<!-- Default profile-->
<profile>
<id>hadoop-3.1</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<hadoop.version>3.1.2</hadoop.version>
</properties>
<modules>
<module>hadoop-submarine-yarnservice-runtime</module>
<module>hadoop-submarine-tony-runtime</module>
</modules>
</profile>
<profile>
<id>hadoop-2.9</id>
<properties>
<hadoop.version>2.9.2</hadoop.version>
</properties>
<modules>
<module>hadoop-submarine-tony-runtime</module>
</modules>
</profile>
<profile>
<id>hadoop-2.7</id>
<properties>
<hadoop.version>2.7.3</hadoop.version>
</properties>
<modules>
<module>hadoop-submarine-tony-runtime</module>
</modules>
</profile>
<profiles>
<profile> <profile>
<id>clover</id> <id>clover</id>
<activation> <activation>
@ -57,4 +153,5 @@
</dependencies> </dependencies>
</profile> </profile>
</profiles> </profiles>
</project> </project>