SUBMARINE-58. Submarine client needs to generate fat jar. Contributed by Zac Zhou.
This commit is contained in:
parent
732133cb2a
commit
729ccb2cab
183
hadoop-submarine/hadoop-submarine-all/pom.xml
Normal file
183
hadoop-submarine/hadoop-submarine-all/pom.xml
Normal file
@ -0,0 +1,183 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<artifactId>hadoop-submarine</artifactId>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<version>0.2.0-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<artifactId>${project.artifactId}</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<name>Hadoop Submarine All</name>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<!-- Needed for generating FindBugs warnings using parent pom -->
|
||||||
|
<yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
|
||||||
|
<project.artifactId>hadoop-submarine-all</project.artifactId>
|
||||||
|
<project.version>0.2.0-SNAPSHOT</project.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
|
||||||
|
<!-- Dependencies for Hadoop commons -->
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-common</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.2</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs-client</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<!-- Default profile-->
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.1</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs-client</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.9</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs-client</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.7</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-shade-plugin</artifactId>
|
||||||
|
<version>3.2.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>shade</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<!--
|
||||||
|
<shadedArtifactAttached>true</shadedArtifactAttached>
|
||||||
|
<shadedClassifierName>with-all-dependencies</shadedClassifierName>
|
||||||
|
-->
|
||||||
|
<outputFile>target/${project.artifactId}-${project.version}-${project.activeProfiles[0].id}.jar</outputFile>
|
||||||
|
<artifactSet>
|
||||||
|
<excludes>
|
||||||
|
<exclude>classworlds:classworlds</exclude>
|
||||||
|
<exclude>junit:junit</exclude>
|
||||||
|
<exclude>jmock:*</exclude>
|
||||||
|
<exclude>*:xml-apis</exclude>
|
||||||
|
<exclude>org.apache.maven:lib:tests</exclude>
|
||||||
|
</excludes>
|
||||||
|
</artifactSet>
|
||||||
|
<filters>
|
||||||
|
<filter>
|
||||||
|
<artifact>*:*</artifact>
|
||||||
|
<excludes>
|
||||||
|
<exclude>META-INF/*.SF</exclude>
|
||||||
|
<exclude>META-INF/*.DSA</exclude>
|
||||||
|
<exclude>META-INF/*.RSA</exclude>
|
||||||
|
</excludes>
|
||||||
|
</filter>
|
||||||
|
</filters>
|
||||||
|
<transformers>
|
||||||
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||||
|
<mainClass>org.apache.hadoop.yarn.submarine.client.cli.Cli</mainClass>
|
||||||
|
</transformer>
|
||||||
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
||||||
|
</transformers>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
@ -67,6 +67,10 @@
|
|||||||
<groupId>org.yaml</groupId>
|
<groupId>org.yaml</groupId>
|
||||||
<artifactId>snakeyaml</artifactId>
|
<artifactId>snakeyaml</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Dependencies for Hadoop commons -->
|
<!-- Dependencies for Hadoop commons -->
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
|
|||||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
|
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
|
||||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
import org.yaml.snakeyaml.introspector.Property;
|
import org.yaml.snakeyaml.introspector.Property;
|
||||||
import org.yaml.snakeyaml.introspector.PropertyUtils;
|
import org.yaml.snakeyaml.introspector.PropertyUtils;
|
||||||
|
|
||||||
@ -271,8 +271,7 @@ public abstract class RunJobParameters extends RunParameters {
|
|||||||
throw new ParseException(
|
throw new ParseException(
|
||||||
"--" + CliConstants.WORKER_RES + " is absent.");
|
"--" + CliConstants.WORKER_RES + " is absent.");
|
||||||
}
|
}
|
||||||
return ResourceUtils.createResourceFromString(workerResourceStr,
|
return ResourceUtils.createResourceFromString(workerResourceStr);
|
||||||
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.param.ParametersHolder;
|
|||||||
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
|
import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -127,8 +127,7 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
|
|||||||
if (psResourceStr == null) {
|
if (psResourceStr == null) {
|
||||||
throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
|
throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
|
||||||
}
|
}
|
||||||
return ResourceUtils.createResourceFromString(psResourceStr,
|
return ResourceUtils.createResourceFromString(psResourceStr);
|
||||||
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -151,9 +150,8 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
|
|||||||
if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
|
if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
|
||||||
tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
|
tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
|
||||||
}
|
}
|
||||||
Resource tensorboardResource =
|
Resource tensorboardResource = ResourceUtils.createResourceFromString(
|
||||||
ResourceUtils.createResourceFromString(tensorboardResourceStr,
|
tensorboardResourceStr);
|
||||||
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
|
|
||||||
String tensorboardDockerImage =
|
String tensorboardDockerImage =
|
||||||
parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE);
|
parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE);
|
||||||
return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null,
|
return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null,
|
||||||
|
@ -0,0 +1,332 @@
|
|||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.resource;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.lang.reflect.Method;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class implements some methods with the almost the same logic as
|
||||||
|
* org.apache.hadoop.yarn.util.resource.ResourceUtils of hadoop 3.3.
|
||||||
|
* If the hadoop dependencies are upgraded to 3.3, this class can be refactored
|
||||||
|
* with org.apache.hadoop.yarn.util.resource.ResourceUtils.
|
||||||
|
*/
|
||||||
|
public final class ResourceUtils {
|
||||||
|
|
||||||
|
private final static String RES_PATTERN = "^[^=]+=\\d+\\s?\\w*$";
|
||||||
|
private final static String SET_RESOURCE_VALUE_METHOD = "setResourceValue";
|
||||||
|
private final static String SET_MEMORY_SIZE_METHOD = "setMemorySize";
|
||||||
|
private final static String DEPRECATED_SET_MEMORY_SIZE_METHOD =
|
||||||
|
"setMemory";
|
||||||
|
private final static String GET_MEMORY_SIZE_METHOD = "getMemorySize";
|
||||||
|
private final static String DEPRECATED_GET_MEMORY_SIZE_METHOD =
|
||||||
|
"getMemory";
|
||||||
|
private final static String GET_RESOURCE_VALUE_METHOD = "getResourceValue";
|
||||||
|
private final static String GET_RESOURCE_TYPE_METHOD =
|
||||||
|
"getResourcesTypeInfo";
|
||||||
|
private final static String REINITIALIZE_RESOURCES_METHOD =
|
||||||
|
"reinitializeResources";
|
||||||
|
public static final String MEMORY_URI = "memory-mb";
|
||||||
|
public static final String VCORES_URI = "vcores";
|
||||||
|
public static final String GPU_URI = "yarn.io/gpu";
|
||||||
|
public static final String FPGA_URI = "yarn.io/fpga";
|
||||||
|
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(ResourceUtils.class);
|
||||||
|
|
||||||
|
private ResourceUtils() {}
|
||||||
|
|
||||||
|
public static Resource createResourceFromString(String resourceStr) {
|
||||||
|
Map<String, Long> typeToValue = parseResourcesString(resourceStr);
|
||||||
|
Resource resource = Resource.newInstance(0, 0);
|
||||||
|
for (Map.Entry<String, Long> entry : typeToValue.entrySet()) {
|
||||||
|
if(entry.getKey().equals(VCORES_URI)) {
|
||||||
|
resource.setVirtualCores(entry.getValue().intValue());
|
||||||
|
continue;
|
||||||
|
} else if (entry.getKey().equals(MEMORY_URI)) {
|
||||||
|
setMemorySize(resource, entry.getValue());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
setResource(resource, entry.getKey(), entry.getValue().intValue());
|
||||||
|
}
|
||||||
|
return resource;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, Long> parseResourcesString(String resourcesStr) {
|
||||||
|
Map<String, Long> resources = new HashMap<>();
|
||||||
|
String[] pairs = resourcesStr.trim().split(",");
|
||||||
|
for (String resource : pairs) {
|
||||||
|
resource = resource.trim();
|
||||||
|
if (!resource.matches(RES_PATTERN)) {
|
||||||
|
throw new IllegalArgumentException("\"" + resource + "\" is not a "
|
||||||
|
+ "valid resource type/amount pair. "
|
||||||
|
+ "Please provide key=amount pairs separated by commas.");
|
||||||
|
}
|
||||||
|
String[] splits = resource.split("=");
|
||||||
|
String key = splits[0], value = splits[1];
|
||||||
|
String units = getUnits(value);
|
||||||
|
|
||||||
|
String valueWithoutUnit = value.substring(0,
|
||||||
|
value.length()- units.length()).trim();
|
||||||
|
long resourceValue = Long.parseLong(valueWithoutUnit);
|
||||||
|
|
||||||
|
// Convert commandline unit to standard YARN unit.
|
||||||
|
if (units.equals("M") || units.equals("m")) {
|
||||||
|
units = "Mi";
|
||||||
|
} else if (units.equals("G") || units.equals("g")) {
|
||||||
|
units = "Gi";
|
||||||
|
} else if (!units.isEmpty()){
|
||||||
|
throw new IllegalArgumentException("Acceptable units are M/G or empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle memory-mb and memory
|
||||||
|
if (key.equals(MEMORY_URI)) {
|
||||||
|
if (!units.isEmpty()) {
|
||||||
|
resourceValue = UnitsConversionUtil.convert(units, "Mi",
|
||||||
|
resourceValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (key.equals("memory")) {
|
||||||
|
key = MEMORY_URI;
|
||||||
|
resourceValue = UnitsConversionUtil.convert(units, "Mi",
|
||||||
|
resourceValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle gpu
|
||||||
|
if (key.equals("gpu")) {
|
||||||
|
key = GPU_URI;
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle fpga
|
||||||
|
if (key.equals("fpga")) {
|
||||||
|
key = FPGA_URI;
|
||||||
|
}
|
||||||
|
|
||||||
|
resources.put(key, resourceValue);
|
||||||
|
}
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
|
||||||
|
* Use reflection to set GPU or other resources for compatibility with
|
||||||
|
* hadoop 2.9.2
|
||||||
|
*/
|
||||||
|
public static void setResource(Resource resource, String resourceName,
|
||||||
|
int resourceValue) {
|
||||||
|
try {
|
||||||
|
Method method = resource.getClass().getMethod(SET_RESOURCE_VALUE_METHOD,
|
||||||
|
String.class, long.class);
|
||||||
|
method.invoke(resource, resourceName, resourceValue);
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
LOG.error("There is no '" + SET_RESOURCE_VALUE_METHOD + "' API in this" +
|
||||||
|
"version of YARN", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.error("Failed to invoke '" + SET_RESOURCE_VALUE_METHOD +
|
||||||
|
"' method to set GPU resources", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void setMemorySize(Resource resource, Long memorySize) {
|
||||||
|
boolean useWithIntParameter = false;
|
||||||
|
// For hadoop 2.9.2 and above
|
||||||
|
try {
|
||||||
|
Method method = resource.getClass().getMethod(SET_MEMORY_SIZE_METHOD,
|
||||||
|
long.class);
|
||||||
|
method.setAccessible(true);
|
||||||
|
method.invoke(resource, memorySize);
|
||||||
|
} catch (NoSuchMethodException nsme) {
|
||||||
|
LOG.info("There is no '" + SET_MEMORY_SIZE_METHOD + "(long)' API in" +
|
||||||
|
" this version of YARN");
|
||||||
|
useWithIntParameter = true;
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.error("Failed to invoke '" + SET_MEMORY_SIZE_METHOD +
|
||||||
|
"' method", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
// For hadoop 2.7.3
|
||||||
|
if (useWithIntParameter) {
|
||||||
|
try {
|
||||||
|
LOG.info("Trying to use '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
|
||||||
|
"(int)' API for this version of YARN");
|
||||||
|
Method method = resource.getClass().getMethod(
|
||||||
|
DEPRECATED_SET_MEMORY_SIZE_METHOD, int.class);
|
||||||
|
method.invoke(resource, memorySize.intValue());
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
LOG.error("There is no '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
|
||||||
|
"(int)' API in this version of YARN", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.error("Failed to invoke '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
|
||||||
|
"' method", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long getMemorySize(Resource resource) {
|
||||||
|
boolean useWithIntParameter = false;
|
||||||
|
long memory = 0;
|
||||||
|
// For hadoop 2.9.2 and above
|
||||||
|
try {
|
||||||
|
Method method = resource.getClass().getMethod(GET_MEMORY_SIZE_METHOD);
|
||||||
|
method.setAccessible(true);
|
||||||
|
memory = (long) method.invoke(resource);
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
LOG.info("There is no '" + GET_MEMORY_SIZE_METHOD + "' API in" +
|
||||||
|
" this version of YARN");
|
||||||
|
useWithIntParameter = true;
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.error("Failed to invoke '" + GET_MEMORY_SIZE_METHOD +
|
||||||
|
"' method", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
// For hadoop 2.7.3
|
||||||
|
if (useWithIntParameter) {
|
||||||
|
try {
|
||||||
|
LOG.info("Trying to use '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
|
||||||
|
"' API for this version of YARN");
|
||||||
|
Method method = resource.getClass().getMethod(
|
||||||
|
DEPRECATED_GET_MEMORY_SIZE_METHOD);
|
||||||
|
method.setAccessible(true);
|
||||||
|
memory = ((Integer) method.invoke(resource)).longValue();
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
LOG.error("There is no '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
|
||||||
|
"' API in this version of YARN", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.error("Failed to invoke '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
|
||||||
|
"' method", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return memory;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
|
||||||
|
* Use reflection to set GPU or other resources for compatibility with
|
||||||
|
* hadoop 2.9.2
|
||||||
|
*/
|
||||||
|
public static long getResourceValue(Resource resource, String resourceName) {
|
||||||
|
long resourceValue = 0;
|
||||||
|
try {
|
||||||
|
Method method = resource.getClass().getMethod(GET_RESOURCE_VALUE_METHOD,
|
||||||
|
String.class);
|
||||||
|
Object value = method.invoke(resource, resourceName);
|
||||||
|
resourceValue = (long) value;
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" +
|
||||||
|
" version of YARN");
|
||||||
|
} catch (InvocationTargetException e) {
|
||||||
|
if (e.getTargetException().getClass().getName().equals(
|
||||||
|
"org.apache.hadoop.yarn.exceptions.ResourceNotFoundException")) {
|
||||||
|
LOG.info("Not found resource " + resourceName);
|
||||||
|
} else {
|
||||||
|
LOG.info("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD + "'" +
|
||||||
|
" method to get resource " + resourceName);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
} catch (IllegalAccessException | ClassCastException e) {
|
||||||
|
LOG.error("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD +
|
||||||
|
"' method to get resource " + resourceName, e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
return resourceValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* As hadoop 2.9.2 and lower don't support resources except cpu and memory.
|
||||||
|
* Use reflection to add GPU or other resources for compatibility with
|
||||||
|
* hadoop 2.9.2
|
||||||
|
*/
|
||||||
|
public static void configureResourceType(String resrouceName) {
|
||||||
|
Class resourceTypeInfo;
|
||||||
|
try{
|
||||||
|
resourceTypeInfo = Class.forName(
|
||||||
|
"org.apache.hadoop.yarn.api.records.ResourceTypeInfo");
|
||||||
|
Class resourceUtils = Class.forName(
|
||||||
|
"org.apache.hadoop.yarn.util.resource.ResourceUtils");
|
||||||
|
Method method = resourceUtils.getMethod(GET_RESOURCE_TYPE_METHOD);
|
||||||
|
Object resTypes = method.invoke(null);
|
||||||
|
|
||||||
|
Method resourceTypeInstance = resourceTypeInfo.getMethod("newInstance",
|
||||||
|
String.class, String.class);
|
||||||
|
Object resourceType = resourceTypeInstance.invoke(null, resrouceName, "");
|
||||||
|
((ArrayList)resTypes).add(resourceType);
|
||||||
|
|
||||||
|
Method reInitialMethod = resourceUtils.getMethod(
|
||||||
|
REINITIALIZE_RESOURCES_METHOD, List.class);
|
||||||
|
reInitialMethod.invoke(null, resTypes);
|
||||||
|
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
LOG.info("There is no specified class API in this" +
|
||||||
|
" version of YARN");
|
||||||
|
LOG.info(e.getMessage());
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
} catch (NoSuchMethodException nsme) {
|
||||||
|
LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" +
|
||||||
|
" version of YARN");
|
||||||
|
} catch (IllegalAccessException | InvocationTargetException e) {
|
||||||
|
LOG.info("Failed to invoke 'configureResourceType' method ", e);
|
||||||
|
throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getUnits(String resourceValue) {
|
||||||
|
return parseResourceValue(resourceValue)[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract unit and actual value from resource value.
|
||||||
|
* @param resourceValue Value of the resource
|
||||||
|
* @return Array containing unit and value. [0]=unit, [1]=value
|
||||||
|
* @throws IllegalArgumentException if units contain non alpha characters
|
||||||
|
*/
|
||||||
|
private static String[] parseResourceValue(String resourceValue) {
|
||||||
|
String[] resource = new String[2];
|
||||||
|
int i = 0;
|
||||||
|
for (; i < resourceValue.length(); i++) {
|
||||||
|
if (Character.isAlphabetic(resourceValue.charAt(i))) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
String units = resourceValue.substring(i);
|
||||||
|
|
||||||
|
if (StringUtils.isAlpha(units) || units.equals("")) {
|
||||||
|
resource[0] = units;
|
||||||
|
resource[1] = resourceValue.substring(0, i);
|
||||||
|
return resource;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("Units '" + units + "'"
|
||||||
|
+ " contains non alphabet characters, which is not allowed.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,164 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.resource;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Almost the same logic as UnitsConversionUtil[YARN-4081]. If the dependencies
|
||||||
|
* are upgraded to hadoop 3.*, this class can be replaced.
|
||||||
|
*/
|
||||||
|
public final class UnitsConversionUtil {
|
||||||
|
|
||||||
|
private UnitsConversionUtil() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper class for encapsulating conversion values.
|
||||||
|
*/
|
||||||
|
public static class Converter {
|
||||||
|
private long numerator;
|
||||||
|
private long denominator;
|
||||||
|
|
||||||
|
Converter(long n, long d) {
|
||||||
|
this.numerator = n;
|
||||||
|
this.denominator = d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final String[] UNITS = {"p", "n", "u", "m", "", "k", "M", "G",
|
||||||
|
"T", "P", "Ki", "Mi", "Gi", "Ti", "Pi"};
|
||||||
|
private static final List<String> SORTED_UNITS = Arrays.asList(UNITS);
|
||||||
|
public static final Set<String> KNOWN_UNITS = createKnownUnitsSet();
|
||||||
|
private static final Converter PICO =
|
||||||
|
new Converter(1L, 1000L * 1000L * 1000L * 1000L);
|
||||||
|
private static final Converter NANO =
|
||||||
|
new Converter(1L, 1000L * 1000L * 1000L);
|
||||||
|
private static final Converter MICRO = new Converter(1L, 1000L * 1000L);
|
||||||
|
private static final Converter MILLI = new Converter(1L, 1000L);
|
||||||
|
private static final Converter BASE = new Converter(1L, 1L);
|
||||||
|
private static final Converter KILO = new Converter(1000L, 1L);
|
||||||
|
private static final Converter MEGA = new Converter(1000L * 1000L, 1L);
|
||||||
|
private static final Converter GIGA =
|
||||||
|
new Converter(1000L * 1000L * 1000L, 1L);
|
||||||
|
private static final Converter TERA =
|
||||||
|
new Converter(1000L * 1000L * 1000L * 1000L, 1L);
|
||||||
|
private static final Converter PETA =
|
||||||
|
new Converter(1000L * 1000L * 1000L * 1000L * 1000L, 1L);
|
||||||
|
|
||||||
|
private static final Converter KILO_BINARY = new Converter(1024L, 1L);
|
||||||
|
private static final Converter MEGA_BINARY = new Converter(1024L * 1024L, 1L);
|
||||||
|
private static final Converter GIGA_BINARY =
|
||||||
|
new Converter(1024L * 1024L * 1024L, 1L);
|
||||||
|
private static final Converter TERA_BINARY =
|
||||||
|
new Converter(1024L * 1024L * 1024L * 1024L, 1L);
|
||||||
|
private static final Converter PETA_BINARY =
|
||||||
|
new Converter(1024L * 1024L * 1024L * 1024L * 1024L, 1L);
|
||||||
|
|
||||||
|
private static Set<String> createKnownUnitsSet() {
|
||||||
|
Set<String> ret = new HashSet<>();
|
||||||
|
ret.addAll(Arrays.asList(UNITS));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Converter getConverter(String unit) {
|
||||||
|
switch (unit) {
|
||||||
|
case "p":
|
||||||
|
return PICO;
|
||||||
|
case "n":
|
||||||
|
return NANO;
|
||||||
|
case "u":
|
||||||
|
return MICRO;
|
||||||
|
case "m":
|
||||||
|
return MILLI;
|
||||||
|
case "":
|
||||||
|
return BASE;
|
||||||
|
case "k":
|
||||||
|
return KILO;
|
||||||
|
case "M":
|
||||||
|
return MEGA;
|
||||||
|
case "G":
|
||||||
|
return GIGA;
|
||||||
|
case "T":
|
||||||
|
return TERA;
|
||||||
|
case "P":
|
||||||
|
return PETA;
|
||||||
|
case "Ki":
|
||||||
|
return KILO_BINARY;
|
||||||
|
case "Mi":
|
||||||
|
return MEGA_BINARY;
|
||||||
|
case "Gi":
|
||||||
|
return GIGA_BINARY;
|
||||||
|
case "Ti":
|
||||||
|
return TERA_BINARY;
|
||||||
|
case "Pi":
|
||||||
|
return PETA_BINARY;
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"Unknown unit '" + unit + "'. Known units are " + KNOWN_UNITS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a value from one unit to another. Supported units can be obtained
|
||||||
|
* by inspecting the KNOWN_UNITS set.
|
||||||
|
*
|
||||||
|
* @param fromUnit the unit of the from value
|
||||||
|
* @param toUnit the target unit
|
||||||
|
* @param fromValue the value you wish to convert
|
||||||
|
* @return the value in toUnit
|
||||||
|
*/
|
||||||
|
public static long convert(String fromUnit, String toUnit, long fromValue) {
|
||||||
|
if (toUnit == null || fromUnit == null) {
|
||||||
|
throw new IllegalArgumentException("One or more arguments are null");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fromUnit.equals(toUnit)) {
|
||||||
|
return fromValue;
|
||||||
|
}
|
||||||
|
Converter fc = getConverter(fromUnit);
|
||||||
|
Converter tc = getConverter(toUnit);
|
||||||
|
long numerator = fc.numerator * tc.denominator;
|
||||||
|
long denominator = fc.denominator * tc.numerator;
|
||||||
|
long numeratorMultiplierLimit = Long.MAX_VALUE / numerator;
|
||||||
|
if (numerator < denominator) {
|
||||||
|
if (numeratorMultiplierLimit < fromValue) {
|
||||||
|
String overflowMsg =
|
||||||
|
"Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
|
||||||
|
+ "' will result in an overflow of Long";
|
||||||
|
throw new IllegalArgumentException(overflowMsg);
|
||||||
|
}
|
||||||
|
return (fromValue * numerator) / denominator;
|
||||||
|
}
|
||||||
|
if (numeratorMultiplierLimit > fromValue) {
|
||||||
|
return (numerator * fromValue) / denominator;
|
||||||
|
}
|
||||||
|
long tmp = numerator / denominator;
|
||||||
|
if ((Long.MAX_VALUE / tmp) < fromValue) {
|
||||||
|
String overflowMsg =
|
||||||
|
"Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
|
||||||
|
+ "' will result in an overflow of Long";
|
||||||
|
throw new IllegalArgumentException(overflowMsg);
|
||||||
|
}
|
||||||
|
return fromValue * tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,19 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* This package contains resource utility classes.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.resource;
|
@ -164,6 +164,22 @@ See below screenshot:
|
|||||||
|
|
||||||
![alt text](./images/tensorboard-service.png "Tensorboard service")
|
![alt text](./images/tensorboard-service.png "Tensorboard service")
|
||||||
|
|
||||||
|
If there is no hadoop client, we can also use the java command and the uber jar, hadoop-submarine-all-*.jar, to submit the job.
|
||||||
|
|
||||||
|
```
|
||||||
|
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
|
||||||
|
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
|
||||||
|
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||||
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \
|
||||||
|
--docker_image <your-docker-image> \
|
||||||
|
--input_path hdfs://default/dataset/cifar-10-data \
|
||||||
|
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
|
||||||
|
--worker_resources memory=4G,vcores=2,gpu=2 \
|
||||||
|
--worker_launch_cmd "python ... (Your training application cmd)" \
|
||||||
|
--tensorboard # this will launch a companion tensorboard container for monitoring
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
### Launch Distributed Tensorflow Application:
|
### Launch Distributed Tensorflow Application:
|
||||||
|
|
||||||
#### Commandline
|
#### Commandline
|
||||||
@ -181,6 +197,20 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
|
|||||||
--num_ps 2 \
|
--num_ps 2 \
|
||||||
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
|
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
|
||||||
```
|
```
|
||||||
|
Or
|
||||||
|
```
|
||||||
|
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
|
||||||
|
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
|
||||||
|
--name tf-job-001 --docker_image <your docker image> \
|
||||||
|
--input_path hdfs://default/dataset/cifar-10-data \
|
||||||
|
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
|
||||||
|
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||||
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
|
||||||
|
--num_workers 2 \
|
||||||
|
--worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \
|
||||||
|
--num_ps 2 \
|
||||||
|
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
|
||||||
|
```
|
||||||
|
|
||||||
#### Notes:
|
#### Notes:
|
||||||
|
|
||||||
@ -197,7 +227,11 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
|
|||||||
```
|
```
|
||||||
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
|
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
|
||||||
```
|
```
|
||||||
|
Or
|
||||||
|
```
|
||||||
|
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
|
||||||
|
org.apache.hadoop.yarn.submarine.client.cli.Cli job show --name tf-job-001
|
||||||
|
```
|
||||||
Output looks like:
|
Output looks like:
|
||||||
```
|
```
|
||||||
Job Meta Info:
|
Job Meta Info:
|
||||||
@ -222,6 +256,17 @@ yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
|
|||||||
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
|
||||||
--num_workers 0 --tensorboard
|
--num_workers 0 --tensorboard
|
||||||
```
|
```
|
||||||
|
Or
|
||||||
|
```
|
||||||
|
# Cleanup previous service if needed
|
||||||
|
yarn app -destroy tensorboard-service; \
|
||||||
|
java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
|
||||||
|
org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
|
||||||
|
--name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
|
||||||
|
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||||
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
|
||||||
|
--num_workers 0 --tensorboard
|
||||||
|
```
|
||||||
|
|
||||||
You can view multiple job training history like from the `Tensorboard` link:
|
You can view multiple job training history like from the `Tensorboard` link:
|
||||||
|
|
||||||
@ -243,3 +288,17 @@ If you want to build the Submarine project by yourself, you can follow the steps
|
|||||||
- Run 'mvn install -DskipTests' from Hadoop source top level once.
|
- Run 'mvn install -DskipTests' from Hadoop source top level once.
|
||||||
|
|
||||||
- Navigate to hadoop-submarine folder and run 'mvn clean package'.
|
- Navigate to hadoop-submarine folder and run 'mvn clean package'.
|
||||||
|
|
||||||
|
- By Default, hadoop-submarine is built based on hadoop 3.1.2 dependencies.
|
||||||
|
Both yarn service runtime and tony runtime are built.
|
||||||
|
You can also add a parameter of "-Phadoop-3.2" to specify the dependencies
|
||||||
|
to hadoop 3.2.0.
|
||||||
|
|
||||||
|
- Hadoop-submarine can support hadoop 2.9.2 and hadoop 2.7.4 as well.
|
||||||
|
You can add "-Phadoop-2.9" to build submarine based on hadoop 2.9.2.
|
||||||
|
For example:
|
||||||
|
```
|
||||||
|
mvn clean package -Phadoop-2.9
|
||||||
|
```
|
||||||
|
As yarn service is based on hadoop 3.*, so only tony runtime is built
|
||||||
|
in this case.
|
||||||
|
@ -16,15 +16,14 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.submarine.client.cli.runjob;
|
package org.apache.hadoop.yarn.submarine.client.cli.runjob;
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
||||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
@ -32,10 +31,10 @@ import org.junit.BeforeClass;
|
|||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.ExpectedException;
|
import org.junit.rules.ExpectedException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
|
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
@ -49,6 +48,8 @@ public class TestRunJobCliParsingCommonYaml {
|
|||||||
private static final String DIR_NAME = "runjob-common-yaml";
|
private static final String DIR_NAME = "runjob-common-yaml";
|
||||||
private static final String TF_DIR = "runjob-pytorch-yaml";
|
private static final String TF_DIR = "runjob-pytorch-yaml";
|
||||||
private File yamlConfig;
|
private File yamlConfig;
|
||||||
|
private static Logger LOG = LoggerFactory.getLogger(
|
||||||
|
TestRunJobCliParsingCommonYaml.class);
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void before() {
|
public void before() {
|
||||||
@ -62,10 +63,12 @@ public class TestRunJobCliParsingCommonYaml {
|
|||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void configureResourceTypes() {
|
public static void configureResourceTypes() {
|
||||||
List<ResourceTypeInfo> resTypes = new ArrayList<>(
|
try {
|
||||||
ResourceUtils.getResourcesTypeInfo());
|
ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
|
||||||
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
|
} catch (SubmarineRuntimeException e) {
|
||||||
ResourceUtils.reinitializeResources(resTypes);
|
LOG.info("The hadoop dependency doesn't support gpu resource, " +
|
||||||
|
"so just skip this test case.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Rule
|
@Rule
|
||||||
|
@ -24,29 +24,28 @@ import static org.junit.Assert.assertNull;
|
|||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
|
||||||
import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
|
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
|
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
|
||||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.ExpectedException;
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.common.collect.ImmutableMap;
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test class that verifies the correctness of PyTorch
|
* Test class that verifies the correctness of PyTorch
|
||||||
@ -56,6 +55,8 @@ public class TestRunJobCliParsingPyTorchYaml {
|
|||||||
private static final String OVERRIDDEN_PREFIX = "overridden_";
|
private static final String OVERRIDDEN_PREFIX = "overridden_";
|
||||||
private static final String DIR_NAME = "runjob-pytorch-yaml";
|
private static final String DIR_NAME = "runjob-pytorch-yaml";
|
||||||
private File yamlConfig;
|
private File yamlConfig;
|
||||||
|
private static Logger LOG = LoggerFactory.getLogger(
|
||||||
|
TestRunJobCliParsingPyTorchYaml.class);
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void before() {
|
public void before() {
|
||||||
@ -67,14 +68,6 @@ public class TestRunJobCliParsingPyTorchYaml {
|
|||||||
YamlConfigTestUtils.deleteFile(yamlConfig);
|
YamlConfigTestUtils.deleteFile(yamlConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeClass
|
|
||||||
public static void configureResourceTypes() {
|
|
||||||
List<ResourceTypeInfo> resTypes = new ArrayList<>(
|
|
||||||
ResourceUtils.getResourcesTypeInfo());
|
|
||||||
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
|
|
||||||
ResourceUtils.reinitializeResources(resTypes);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Rule
|
@Rule
|
||||||
public ExpectedException exception = ExpectedException.none();
|
public ExpectedException exception = ExpectedException.none();
|
||||||
|
|
||||||
@ -105,23 +98,38 @@ public class TestRunJobCliParsingPyTorchYaml {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyWorkerValues(RunJobParameters jobRunParameters,
|
private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters
|
||||||
String prefix) {
|
jobRunParameters, String prefix) {
|
||||||
assertTrue(RunJobParameters.class + " must be an instance of " +
|
assertTrue(RunJobParameters.class + " must be an instance of " +
|
||||||
PyTorchRunJobParameters.class,
|
PyTorchRunJobParameters.class,
|
||||||
jobRunParameters instanceof PyTorchRunJobParameters);
|
jobRunParameters instanceof PyTorchRunJobParameters);
|
||||||
PyTorchRunJobParameters tensorFlowParams =
|
PyTorchRunJobParameters pyTorchParams =
|
||||||
(PyTorchRunJobParameters) jobRunParameters;
|
(PyTorchRunJobParameters) jobRunParameters;
|
||||||
|
|
||||||
assertEquals(3, tensorFlowParams.getNumWorkers());
|
assertEquals(3, pyTorchParams.getNumWorkers());
|
||||||
assertEquals(prefix + "testLaunchCmdWorker",
|
assertEquals(prefix + "testLaunchCmdWorker",
|
||||||
tensorFlowParams.getWorkerLaunchCmd());
|
pyTorchParams.getWorkerLaunchCmd());
|
||||||
assertEquals(prefix + "testDockerImageWorker",
|
assertEquals(prefix + "testDockerImageWorker",
|
||||||
tensorFlowParams.getWorkerDockerImage());
|
pyTorchParams.getWorkerDockerImage());
|
||||||
assertEquals(ResourceTypesTestHelper.newResource(20480L, 32,
|
return pyTorchParams;
|
||||||
ImmutableMap.<String, String> builder()
|
}
|
||||||
.put(ResourceInformation.GPU_URI, "2").build()),
|
|
||||||
tensorFlowParams.getWorkerResource());
|
private void verifyWorkerValues(RunJobParameters jobRunParameters,
|
||||||
|
String prefix) {
|
||||||
|
PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
|
||||||
|
(jobRunParameters, prefix);
|
||||||
|
assertEquals(Resources.createResource(20480, 32),
|
||||||
|
pyTorchParams.getWorkerResource());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
|
||||||
|
String prefix) {
|
||||||
|
|
||||||
|
PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
|
||||||
|
(jobRunParameters, prefix);
|
||||||
|
Resource workResource = Resources.createResource(20480, 32);
|
||||||
|
ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
|
||||||
|
assertEquals(workResource, pyTorchParams.getWorkerResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifySecurityValues(RunJobParameters jobRunParameters) {
|
private void verifySecurityValues(RunJobParameters jobRunParameters) {
|
||||||
@ -146,6 +154,30 @@ public class TestRunJobCliParsingPyTorchYaml {
|
|||||||
verifySecurityValues(jobRunParameters);
|
verifySecurityValues(jobRunParameters);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testValidGpuYamlParsing() throws Exception {
|
||||||
|
try {
|
||||||
|
ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
|
||||||
|
} catch (SubmarineRuntimeException e) {
|
||||||
|
LOG.info("The hadoop dependency doesn't support gpu resource, " +
|
||||||
|
"so just skip this test case.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
Assert.assertFalse(SubmarineLogs.isVerbose());
|
||||||
|
|
||||||
|
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
|
||||||
|
DIR_NAME + "/valid-gpu-config.yaml");
|
||||||
|
runJobCli.run(
|
||||||
|
new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
|
||||||
|
|
||||||
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
verifyBasicConfigValues(jobRunParameters);
|
||||||
|
verifyWorkerValuesWithGpu(jobRunParameters, "");
|
||||||
|
verifySecurityValues(jobRunParameters);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRoleOverrides() throws Exception {
|
public void testRoleOverrides() throws Exception {
|
||||||
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
@ -18,26 +18,25 @@
|
|||||||
package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow;
|
package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.common.collect.ImmutableMap;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
|
||||||
import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
|
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
|
import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
|
||||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.rules.ExpectedException;
|
import org.junit.rules.ExpectedException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
|
import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
|
||||||
@ -53,6 +52,8 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
private static final String OVERRIDDEN_PREFIX = "overridden_";
|
private static final String OVERRIDDEN_PREFIX = "overridden_";
|
||||||
private static final String DIR_NAME = "runjob-tensorflow-yaml";
|
private static final String DIR_NAME = "runjob-tensorflow-yaml";
|
||||||
private File yamlConfig;
|
private File yamlConfig;
|
||||||
|
private static Logger LOG = LoggerFactory.getLogger(
|
||||||
|
TestRunJobCliParsingTensorFlowYaml.class);
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void before() {
|
public void before() {
|
||||||
@ -64,14 +65,6 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
YamlConfigTestUtils.deleteFile(yamlConfig);
|
YamlConfigTestUtils.deleteFile(yamlConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeClass
|
|
||||||
public static void configureResourceTypes() {
|
|
||||||
List<ResourceTypeInfo> resTypes = new ArrayList<>(
|
|
||||||
ResourceUtils.getResourcesTypeInfo());
|
|
||||||
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
|
|
||||||
ResourceUtils.reinitializeResources(resTypes);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Rule
|
@Rule
|
||||||
public ExpectedException exception = ExpectedException.none();
|
public ExpectedException exception = ExpectedException.none();
|
||||||
|
|
||||||
@ -114,14 +107,12 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd());
|
assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd());
|
||||||
assertEquals(prefix + "testDockerImagePs",
|
assertEquals(prefix + "testDockerImagePs",
|
||||||
tensorFlowParams.getPsDockerImage());
|
tensorFlowParams.getPsDockerImage());
|
||||||
assertEquals(ResourceTypesTestHelper.newResource(20500L, 34,
|
assertEquals(Resources.createResource(20500, 34),
|
||||||
ImmutableMap.<String, String> builder()
|
|
||||||
.put(ResourceInformation.GPU_URI, "4").build()),
|
|
||||||
tensorFlowParams.getPsResource());
|
tensorFlowParams.getPsResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyWorkerValues(RunJobParameters jobRunParameters,
|
private TensorFlowRunJobParameters verifyWorkerCommonValues(
|
||||||
String prefix) {
|
RunJobParameters jobRunParameters, String prefix) {
|
||||||
assertTrue(RunJobParameters.class + " must be an instance of " +
|
assertTrue(RunJobParameters.class + " must be an instance of " +
|
||||||
TensorFlowRunJobParameters.class,
|
TensorFlowRunJobParameters.class,
|
||||||
jobRunParameters instanceof TensorFlowRunJobParameters);
|
jobRunParameters instanceof TensorFlowRunJobParameters);
|
||||||
@ -133,12 +124,26 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
tensorFlowParams.getWorkerLaunchCmd());
|
tensorFlowParams.getWorkerLaunchCmd());
|
||||||
assertEquals(prefix + "testDockerImageWorker",
|
assertEquals(prefix + "testDockerImageWorker",
|
||||||
tensorFlowParams.getWorkerDockerImage());
|
tensorFlowParams.getWorkerDockerImage());
|
||||||
assertEquals(ResourceTypesTestHelper.newResource(20480L, 32,
|
return tensorFlowParams;
|
||||||
ImmutableMap.<String, String> builder()
|
}
|
||||||
.put(ResourceInformation.GPU_URI, "2").build()),
|
|
||||||
|
private void verifyWorkerValues(RunJobParameters jobRunParameters,
|
||||||
|
String prefix) {
|
||||||
|
TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
|
||||||
|
(jobRunParameters, prefix);
|
||||||
|
assertEquals(Resources.createResource(20480, 32),
|
||||||
tensorFlowParams.getWorkerResource());
|
tensorFlowParams.getWorkerResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
|
||||||
|
String prefix) {
|
||||||
|
TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
|
||||||
|
(jobRunParameters, prefix);
|
||||||
|
Resource workResource = Resources.createResource(20480, 32);
|
||||||
|
ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
|
||||||
|
assertEquals(workResource, tensorFlowParams.getWorkerResource());
|
||||||
|
}
|
||||||
|
|
||||||
private void verifySecurityValues(RunJobParameters jobRunParameters) {
|
private void verifySecurityValues(RunJobParameters jobRunParameters) {
|
||||||
assertEquals("keytabPath", jobRunParameters.getKeytab());
|
assertEquals("keytabPath", jobRunParameters.getKeytab());
|
||||||
assertEquals("testPrincipal", jobRunParameters.getPrincipal());
|
assertEquals("testPrincipal", jobRunParameters.getPrincipal());
|
||||||
@ -155,9 +160,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
assertTrue(tensorFlowParams.isTensorboardEnabled());
|
assertTrue(tensorFlowParams.isTensorboardEnabled());
|
||||||
assertEquals("tensorboardDockerImage",
|
assertEquals("tensorboardDockerImage",
|
||||||
tensorFlowParams.getTensorboardDockerImage());
|
tensorFlowParams.getTensorboardDockerImage());
|
||||||
assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
|
assertEquals(Resources.createResource(21000, 37),
|
||||||
ImmutableMap.<String, String> builder()
|
|
||||||
.put(ResourceInformation.GPU_URI, "3").build()),
|
|
||||||
tensorFlowParams.getTensorboardResource());
|
tensorFlowParams.getTensorboardResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,6 +182,32 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
verifyTensorboardValues(jobRunParameters);
|
verifyTensorboardValues(jobRunParameters);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testValidGpuYamlParsing() throws Exception {
|
||||||
|
try {
|
||||||
|
ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
|
||||||
|
} catch (SubmarineRuntimeException e) {
|
||||||
|
LOG.info("The hadoop dependency doesn't support gpu resource, " +
|
||||||
|
"so just skip this test case.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
Assert.assertFalse(SubmarineLogs.isVerbose());
|
||||||
|
|
||||||
|
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
|
||||||
|
DIR_NAME + "/valid-gpu-config.yaml");
|
||||||
|
runJobCli.run(
|
||||||
|
new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
|
||||||
|
|
||||||
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
verifyBasicConfigValues(jobRunParameters);
|
||||||
|
verifyPsValues(jobRunParameters, "");
|
||||||
|
verifyWorkerValuesWithGpu(jobRunParameters, "");
|
||||||
|
verifySecurityValues(jobRunParameters);
|
||||||
|
verifyTensorboardValues(jobRunParameters);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRoleOverrides() throws Exception {
|
public void testRoleOverrides() throws Exception {
|
||||||
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
@ -240,9 +269,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
|
|||||||
assertTrue(tensorFlowParams.isTensorboardEnabled());
|
assertTrue(tensorFlowParams.isTensorboardEnabled());
|
||||||
assertNull("tensorboardDockerImage should be null!",
|
assertNull("tensorboardDockerImage should be null!",
|
||||||
tensorFlowParams.getTensorboardDockerImage());
|
tensorFlowParams.getTensorboardDockerImage());
|
||||||
assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
|
assertEquals(Resources.createResource(21000, 37),
|
||||||
ImmutableMap.<String, String> builder()
|
|
||||||
.put(ResourceInformation.GPU_URI, "3").build()),
|
|
||||||
tensorFlowParams.getTensorboardResource());
|
tensorFlowParams.getTensorboardResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,14 +132,14 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
|
|||||||
private void assertWorkerValues(Role worker) {
|
private void assertWorkerValues(Role worker) {
|
||||||
assertEquals("testLaunchCmdWorker", worker.getLaunchCmd());
|
assertEquals("testLaunchCmdWorker", worker.getLaunchCmd());
|
||||||
assertEquals("testDockerImageWorker", worker.getDockerImage());
|
assertEquals("testDockerImageWorker", worker.getDockerImage());
|
||||||
assertEquals("memory=20480M,vcores=32,gpu=2", worker.getResources());
|
assertEquals("memory=20480M,vcores=32", worker.getResources());
|
||||||
assertEquals(3, worker.getReplicas());
|
assertEquals(3, worker.getReplicas());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertPsValues(Role ps) {
|
private void assertPsValues(Role ps) {
|
||||||
assertEquals("testLaunchCmdPs", ps.getLaunchCmd());
|
assertEquals("testLaunchCmdPs", ps.getLaunchCmd());
|
||||||
assertEquals("testDockerImagePs", ps.getDockerImage());
|
assertEquals("testDockerImagePs", ps.getDockerImage());
|
||||||
assertEquals("memory=20500M,vcores=34,gpu=4", ps.getResources());
|
assertEquals("memory=20500M,vcores=34", ps.getResources());
|
||||||
assertEquals(4, ps.getReplicas());
|
assertEquals(4, ps.getReplicas());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +161,7 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
|
|||||||
TensorBoard tensorBoard = yamlConfigFile.getTensorBoard();
|
TensorBoard tensorBoard = yamlConfigFile.getTensorBoard();
|
||||||
assertNotNull("Tensorboard should not be null!", tensorBoard);
|
assertNotNull("Tensorboard should not be null!", tensorBoard);
|
||||||
assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage());
|
assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage());
|
||||||
assertEquals("memory=21000M,vcores=37,gpu=3", tensorBoard.getResources());
|
assertEquals("memory=21000M,vcores=37", tensorBoard.getResources());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
|
@ -20,15 +20,6 @@ package org.apache.hadoop.yarn.submarine.common;
|
|||||||
|
|
||||||
import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
|
import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
|
||||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||||
import org.apache.hadoop.yarn.client.api.YarnClient;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import static org.junit.Assert.fail;
|
|
||||||
import static org.mockito.Mockito.mock;
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
public class MockClientContext extends ClientContext {
|
public class MockClientContext extends ClientContext {
|
||||||
|
|
||||||
@ -44,18 +35,4 @@ public class MockClientContext extends ClientContext {
|
|||||||
RemoteDirectoryManager remoteDirectoryMgr) {
|
RemoteDirectoryManager remoteDirectoryMgr) {
|
||||||
this.remoteDirectoryMgr = remoteDirectoryMgr;
|
this.remoteDirectoryMgr = remoteDirectoryMgr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public synchronized YarnClient getOrCreateYarnClient() {
|
|
||||||
YarnClient client = mock(YarnClient.class);
|
|
||||||
try {
|
|
||||||
when(client.getResourceTypeInfo()).thenReturn(
|
|
||||||
ResourceUtils.getResourcesTypeInfo());
|
|
||||||
} catch (YarnException e) {
|
|
||||||
fail(e.getMessage());
|
|
||||||
} catch (IOException e) {
|
|
||||||
fail(e.getMessage());
|
|
||||||
}
|
|
||||||
return client;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -43,12 +43,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -59,5 +59,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -23,11 +23,11 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
@ -37,5 +37,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -42,12 +42,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -58,5 +58,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -40,10 +40,10 @@ configs:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
@ -43,11 +43,11 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
|
@ -42,11 +42,11 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
@ -56,5 +56,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -42,11 +42,11 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
@ -56,5 +56,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -40,7 +40,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: overridden_testLaunchCmdWorker
|
launch_cmd: overridden_testLaunchCmdWorker
|
||||||
docker_image: overridden_testDockerImageWorker
|
docker_image: overridden_testDockerImageWorker
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
|
@ -0,0 +1,54 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
framework: pytorch
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
@ -40,12 +40,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -56,5 +56,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -43,12 +43,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -58,5 +58,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -43,12 +43,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -59,4 +59,4 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
|
@ -43,7 +43,7 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: overridden_testLaunchCmdWorker
|
launch_cmd: overridden_testLaunchCmdWorker
|
||||||
docker_image: overridden_testDockerImageWorker
|
docker_image: overridden_testDockerImageWorker
|
||||||
@ -58,7 +58,7 @@ roles:
|
|||||||
- /etc/hosts:/overridden_Worker
|
- /etc/hosts:/overridden_Worker
|
||||||
|
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: overridden_testLaunchCmdPs
|
launch_cmd: overridden_testLaunchCmdPs
|
||||||
docker_image: overridden_testDockerImagePs
|
docker_image: overridden_testDockerImagePs
|
||||||
@ -78,5 +78,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -43,12 +43,12 @@ scheduling:
|
|||||||
|
|
||||||
roles:
|
roles:
|
||||||
worker:
|
worker:
|
||||||
resources: memory=20480M,vcores=32,gpu=2
|
resources: memory=20480M,vcores=32
|
||||||
replicas: 3
|
replicas: 3
|
||||||
launch_cmd: testLaunchCmdWorker
|
launch_cmd: testLaunchCmdWorker
|
||||||
docker_image: testDockerImageWorker
|
docker_image: testDockerImageWorker
|
||||||
ps:
|
ps:
|
||||||
resources: memory=20500M,vcores=34,gpu=4
|
resources: memory=20500M,vcores=34
|
||||||
replicas: 4
|
replicas: 4
|
||||||
launch_cmd: testLaunchCmdPs
|
launch_cmd: testLaunchCmdPs
|
||||||
docker_image: testDockerImagePs
|
docker_image: testDockerImagePs
|
||||||
@ -59,5 +59,5 @@ security:
|
|||||||
distribute_keytab: true
|
distribute_keytab: true
|
||||||
|
|
||||||
tensorBoard:
|
tensorBoard:
|
||||||
resources: memory=21000M,vcores=37,gpu=3
|
resources: memory=21000M,vcores=37
|
||||||
docker_image: tensorboardDockerImage
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,63 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
framework: tensorflow
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
docker_image: testDockerImagePs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37
|
||||||
|
docker_image: tensorboardDockerImage
|
131
hadoop-submarine/hadoop-submarine-dist/pom.xml
Normal file
131
hadoop-submarine/hadoop-submarine-dist/pom.xml
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
|
||||||
|
http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<artifactId>hadoop-submarine</artifactId>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<version>0.2.0-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<artifactId>${project.artifactId}</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<name>Hadoop Submarine Dist</name>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<!-- Needed for generating FindBugs warnings using parent pom -->
|
||||||
|
<yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
|
||||||
|
<project.artifactId>hadoop-submarine-dist</project.artifactId>
|
||||||
|
<project.version>0.2.0-SNAPSHOT</project.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.2</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<!-- Default profile-->
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.1</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.9</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.7</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-assembly-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>dist</id>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>single</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<finalName>${project.artifactId}-${project.version}-${project.activeProfiles[0].id}</finalName>
|
||||||
|
<appendAssemblyId>false</appendAssemblyId>
|
||||||
|
<attach>false</attach>
|
||||||
|
<descriptors>
|
||||||
|
<descriptor>src/assembly/distribution.xml</descriptor>
|
||||||
|
</descriptors>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
@ -0,0 +1,61 @@
|
|||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
|
||||||
|
<id>distribution</id>
|
||||||
|
<formats>
|
||||||
|
<format>dir</format>
|
||||||
|
<format>tar.gz</format>
|
||||||
|
</formats>
|
||||||
|
|
||||||
|
<dependencySets>
|
||||||
|
<dependencySet>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<includes>
|
||||||
|
<include>org.apache.hadoop:hadoop-submarine-core</include>
|
||||||
|
</includes>
|
||||||
|
</dependencySet>
|
||||||
|
<dependencySet>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<includes>
|
||||||
|
<include>org.apache.hadoop:hadoop-submarine-tony-runtime</include>
|
||||||
|
</includes>
|
||||||
|
</dependencySet>
|
||||||
|
<dependencySet>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<includes>
|
||||||
|
<include>org.apache.hadoop:hadoop-submarine-yarnservice-runtime</include>
|
||||||
|
</includes>
|
||||||
|
</dependencySet>
|
||||||
|
</dependencySets>
|
||||||
|
|
||||||
|
<fileSets>
|
||||||
|
<fileSet>
|
||||||
|
<directory>../../</directory>
|
||||||
|
<includes>
|
||||||
|
<include>LICENSE*</include>
|
||||||
|
<include>NOTICE*</include>
|
||||||
|
</includes>
|
||||||
|
</fileSet>
|
||||||
|
<fileSet>
|
||||||
|
<directory>../hadoop-submarine-all/target/</directory>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<includes>
|
||||||
|
<include>hadoop-submarine-all-${project.version}-*.jar</include>
|
||||||
|
</includes>
|
||||||
|
</fileSet>
|
||||||
|
</fileSets>
|
||||||
|
</assembly>
|
@ -23,6 +23,7 @@
|
|||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
<artifactId>hadoop-submarine-tony-runtime</artifactId>
|
||||||
|
<name>Hadoop Submarine Tony Runtime</name>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
@ -19,9 +19,8 @@ import com.linkedin.tony.util.Utils;
|
|||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -52,7 +51,7 @@ public final class TonyUtils {
|
|||||||
tonyConf.setLong(
|
tonyConf.setLong(
|
||||||
TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
|
TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
|
||||||
Constants.MEMORY),
|
Constants.MEMORY),
|
||||||
parameters.getPsResource().getMemorySize());
|
ResourceUtils.getMemorySize(parameters.getPsResource()));
|
||||||
}
|
}
|
||||||
if (parameters.getWorkerResource() != null) {
|
if (parameters.getWorkerResource() != null) {
|
||||||
tonyConf.setInt(
|
tonyConf.setInt(
|
||||||
@ -62,16 +61,12 @@ public final class TonyUtils {
|
|||||||
tonyConf.setLong(
|
tonyConf.setLong(
|
||||||
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
|
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
|
||||||
Constants.MEMORY),
|
Constants.MEMORY),
|
||||||
parameters.getWorkerResource().getMemorySize());
|
ResourceUtils.getMemorySize(parameters.getWorkerResource()));
|
||||||
try {
|
|
||||||
tonyConf.setLong(
|
tonyConf.setLong(
|
||||||
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
|
TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
|
||||||
Constants.GPUS),
|
Constants.GPUS),
|
||||||
parameters.getWorkerResource()
|
ResourceUtils.getResourceValue(parameters.getWorkerResource(),
|
||||||
.getResourceValue(ResourceInformation.GPU_URI));
|
ResourceUtils.GPU_URI));
|
||||||
} catch (ResourceNotFoundException rnfe) {
|
|
||||||
LOG.error("GPU resources not enabled.");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (parameters.getQueue() != null) {
|
if (parameters.getQueue() != null) {
|
||||||
tonyConf.set(
|
tonyConf.set(
|
||||||
|
@ -22,7 +22,7 @@
|
|||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<version>0.2.0-SNAPSHOT</version>
|
<version>0.2.0-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>hadoop-submarine-score-yarnservice-runtime</artifactId>
|
<artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
|
||||||
<version>0.2.0-SNAPSHOT</version>
|
<version>0.2.0-SNAPSHOT</version>
|
||||||
<name>Hadoop Submarine YARN Service Runtime</name>
|
<name>Hadoop Submarine YARN Service Runtime</name>
|
||||||
|
|
||||||
@ -108,12 +108,10 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-yarn-services-api</artifactId>
|
<artifactId>hadoop-yarn-services-api</artifactId>
|
||||||
<version>3.3.0-SNAPSHOT</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-yarn-services-core</artifactId>
|
<artifactId>hadoop-yarn-services-core</artifactId>
|
||||||
<version>3.3.0-SNAPSHOT</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
@ -30,7 +30,6 @@ import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchComma
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
|
|
||||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
|
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
|
||||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
|
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
|
||||||
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
||||||
@ -85,8 +84,11 @@ public abstract class AbstractComponent {
|
|||||||
if (role.equals(TensorFlowRole.PRIMARY_WORKER) ||
|
if (role.equals(TensorFlowRole.PRIMARY_WORKER) ||
|
||||||
role.equals(PyTorchRole.PRIMARY_WORKER)) {
|
role.equals(PyTorchRole.PRIMARY_WORKER)) {
|
||||||
component.setNumberOfContainers(1L);
|
component.setNumberOfContainers(1L);
|
||||||
|
// If the dependencies are upgraded to hadoop 3.3.0.
|
||||||
|
// yarn.service.container-state-report-as-service-state can be replaced
|
||||||
|
// with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
|
||||||
component.getConfiguration().setProperty(
|
component.getConfiguration().setProperty(
|
||||||
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
|
"yarn.service.container-state-report-as-service-state", "true");
|
||||||
} else {
|
} else {
|
||||||
component.setNumberOfContainers(
|
component.setNumberOfContainers(
|
||||||
(long) parameters.getNumWorkers() - 1);
|
(long) parameters.getNumWorkers() - 1);
|
||||||
|
@ -33,7 +33,6 @@ import java.io.IOException;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static junit.framework.TestCase.assertTrue;
|
import static junit.framework.TestCase.assertTrue;
|
||||||
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
@ -209,8 +208,11 @@ public class TestTensorFlowWorkerComponent {
|
|||||||
Component component = workerComponent.createComponent();
|
Component component = workerComponent.createComponent();
|
||||||
|
|
||||||
assertEquals(1L, (long) component.getNumberOfContainers());
|
assertEquals(1L, (long) component.getNumberOfContainers());
|
||||||
|
// If the dependencies are upgraded to hadoop 3.3.0.
|
||||||
|
// yarn.service.container-state-report-as-service-state can be replaced
|
||||||
|
// with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
|
||||||
verifyCommons(component, ImmutableMap.of(
|
verifyCommons(component, ImmutableMap.of(
|
||||||
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"));
|
"yarn.service.container-state-report-as-service-state", "true"));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -19,12 +19,17 @@ package org.apache.hadoop.yarn.submarine.utils;
|
|||||||
import com.google.common.collect.ImmutableMap;
|
import com.google.common.collect.ImmutableMap;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.LocalConfigurationProvider;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
||||||
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
|
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
import static org.junit.Assert.*;
|
||||||
@ -33,13 +38,48 @@ import static org.junit.Assert.*;
|
|||||||
* This class is to test {@link SubmarineResourceUtils}.
|
* This class is to test {@link SubmarineResourceUtils}.
|
||||||
*/
|
*/
|
||||||
public class TestSubmarineResourceUtils {
|
public class TestSubmarineResourceUtils {
|
||||||
|
/**
|
||||||
|
* With the dependencies of hadoop 3.2.0, Need to create a
|
||||||
|
* CustomResourceTypesConfigurationProvider implementations. If the
|
||||||
|
* dependencies are upgraded to hadoop 3.3.0. It can be replaced by
|
||||||
|
* org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvi-
|
||||||
|
* der
|
||||||
|
*/
|
||||||
|
private static class CustomResourceTypesConfigurationProvider
|
||||||
|
extends LocalConfigurationProvider {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public InputStream getConfigurationInputStream(Configuration bootstrapConf,
|
||||||
|
String name) throws YarnException, IOException {
|
||||||
|
if (YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE.equals(name)) {
|
||||||
|
return new ByteArrayInputStream(
|
||||||
|
("<configuration>\n" +
|
||||||
|
" <property>\n" +
|
||||||
|
" <name>yarn.resource-types</name>\n" +
|
||||||
|
" <value>" + CUSTOM_RESOURCE_NAME + "</value>\n" +
|
||||||
|
" </property>\n" +
|
||||||
|
" <property>\n" +
|
||||||
|
" <name>yarn.resource-types.a-custom-resource.units</name>\n"
|
||||||
|
+
|
||||||
|
" <value>G</value>\n" +
|
||||||
|
" </property>\n" +
|
||||||
|
"</configuration>\n").getBytes());
|
||||||
|
} else {
|
||||||
|
return super.getConfigurationInputStream(bootstrapConf, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";
|
private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";
|
||||||
|
|
||||||
private void initResourceTypes() {
|
private void initResourceTypes() {
|
||||||
CustomResourceTypesConfigurationProvider.initResourceTypes(
|
// If the dependencies are upgraded to hadoop 3.3.0. It can be replaced by
|
||||||
ImmutableMap.<String, String>builder()
|
// org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationPro-
|
||||||
.put(CUSTOM_RESOURCE_NAME, "G")
|
// vider
|
||||||
.build());
|
Configuration configuration = new Configuration();
|
||||||
|
configuration.set(YarnConfiguration.RM_CONFIGURATION_PROVIDER_CLASS,
|
||||||
|
CustomResourceTypesConfigurationProvider.class.getName());
|
||||||
|
ResourceUtils.resetResourceTypes(configuration);
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-project</artifactId>
|
<artifactId>hadoop-project</artifactId>
|
||||||
<version>3.3.0-SNAPSHOT</version>
|
<version>3.2.0</version>
|
||||||
<relativePath/>
|
<relativePath/>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>hadoop-submarine</artifactId>
|
<artifactId>hadoop-submarine</artifactId>
|
||||||
@ -32,15 +32,111 @@
|
|||||||
<hadoop.common.build.dir>${basedir}/../hadoop-common-project/hadoop-common/target</hadoop.common.build.dir>
|
<hadoop.common.build.dir>${basedir}/../hadoop-common-project/hadoop-common/target</hadoop.common.build.dir>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<!-- Do not add dependencies here, add them to the POM of the leaf module -->
|
|
||||||
|
|
||||||
<modules>
|
<modules>
|
||||||
<module>hadoop-submarine-core</module>
|
<module>hadoop-submarine-core</module>
|
||||||
|
<module>hadoop-submarine-all</module>
|
||||||
|
<module>hadoop-submarine-dist</module>
|
||||||
|
</modules>
|
||||||
|
|
||||||
|
<dependencyManagement>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.mockito</groupId>
|
||||||
|
<artifactId>mockito-core</artifactId>
|
||||||
|
<version>2.23.4</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-services-api</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-common</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-api</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-common</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-client</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-hdfs</artifactId>
|
||||||
|
<version>${hadoop.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
<version>3.7</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</dependencyManagement>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.2</id>
|
||||||
|
<properties>
|
||||||
|
<hadoop.version>3.2.0</hadoop.version>
|
||||||
|
</properties>
|
||||||
|
<modules>
|
||||||
<module>hadoop-submarine-yarnservice-runtime</module>
|
<module>hadoop-submarine-yarnservice-runtime</module>
|
||||||
<module>hadoop-submarine-tony-runtime</module>
|
<module>hadoop-submarine-tony-runtime</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<!-- Default profile-->
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-3.1</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
<properties>
|
||||||
|
<hadoop.version>3.1.2</hadoop.version>
|
||||||
|
</properties>
|
||||||
|
<modules>
|
||||||
|
<module>hadoop-submarine-yarnservice-runtime</module>
|
||||||
|
<module>hadoop-submarine-tony-runtime</module>
|
||||||
|
</modules>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.9</id>
|
||||||
|
<properties>
|
||||||
|
<hadoop.version>2.9.2</hadoop.version>
|
||||||
|
</properties>
|
||||||
|
<modules>
|
||||||
|
<module>hadoop-submarine-tony-runtime</module>
|
||||||
|
</modules>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>hadoop-2.7</id>
|
||||||
|
<properties>
|
||||||
|
<hadoop.version>2.7.3</hadoop.version>
|
||||||
|
</properties>
|
||||||
|
<modules>
|
||||||
|
<module>hadoop-submarine-tony-runtime</module>
|
||||||
|
</modules>
|
||||||
|
</profile>
|
||||||
|
|
||||||
<profiles>
|
|
||||||
<profile>
|
<profile>
|
||||||
<id>clover</id>
|
<id>clover</id>
|
||||||
<activation>
|
<activation>
|
||||||
@ -57,4 +153,5 @@
|
|||||||
</dependencies>
|
</dependencies>
|
||||||
</profile>
|
</profile>
|
||||||
</profiles>
|
</profiles>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user