SUBMARINE-47. Provide an implementation to parse configuration values from a YAML file for submarine run CLI. Contributed by Szilard Nemeth.
This commit is contained in:
parent
e9b859f749
commit
1b9ba0ebb2
@ -63,6 +63,10 @@
|
|||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.yaml</groupId>
|
||||||
|
<artifactId>snakeyaml</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Dependencies for Hadoop commons -->
|
<!-- Dependencies for Hadoop commons -->
|
||||||
|
|
||||||
@ -92,6 +96,12 @@
|
|||||||
<artifactId>mockito-core</artifactId>
|
<artifactId>mockito-core</artifactId>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-common</artifactId>
|
||||||
|
<type>test-jar</type>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
@ -56,4 +56,5 @@ public class CliConstants {
|
|||||||
public static final String KEYTAB = "keytab";
|
public static final String KEYTAB = "keytab";
|
||||||
public static final String PRINCIPAL = "principal";
|
public static final String PRINCIPAL = "principal";
|
||||||
public static final String DISTRIBUTE_KEYTAB = "distribute_keytab";
|
public static final String DISTRIBUTE_KEYTAB = "distribute_keytab";
|
||||||
|
public static final String YAML_CONFIG = "f";
|
||||||
}
|
}
|
||||||
|
@ -20,9 +20,14 @@
|
|||||||
import org.apache.commons.cli.HelpFormatter;
|
import org.apache.commons.cli.HelpFormatter;
|
||||||
import org.apache.commons.cli.Options;
|
import org.apache.commons.cli.Options;
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.ParametersHolder;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters.UnderscoreConverterPropertyUtils;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlConfigFile;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
||||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
@ -30,7 +35,11 @@
|
|||||||
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
|
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.yaml.snakeyaml.Yaml;
|
||||||
|
import org.yaml.snakeyaml.constructor.Constructor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -38,6 +47,8 @@
|
|||||||
public class RunJobCli extends AbstractCli {
|
public class RunJobCli extends AbstractCli {
|
||||||
private static final Logger LOG =
|
private static final Logger LOG =
|
||||||
LoggerFactory.getLogger(RunJobCli.class);
|
LoggerFactory.getLogger(RunJobCli.class);
|
||||||
|
private static final String YAML_PARSE_FAILED = "Failed to parse " +
|
||||||
|
"YAML config";
|
||||||
|
|
||||||
private Options options;
|
private Options options;
|
||||||
private RunJobParameters parameters = new RunJobParameters();
|
private RunJobParameters parameters = new RunJobParameters();
|
||||||
@ -51,10 +62,10 @@ public RunJobCli(ClientContext cliContext) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public RunJobCli(ClientContext cliContext, JobSubmitter jobSubmitter,
|
RunJobCli(ClientContext cliContext, JobSubmitter jobSubmitter,
|
||||||
JobMonitor jobMonitor) {
|
JobMonitor jobMonitor) {
|
||||||
super(cliContext);
|
super(cliContext);
|
||||||
options = generateOptions();
|
this.options = generateOptions();
|
||||||
this.jobSubmitter = jobSubmitter;
|
this.jobSubmitter = jobSubmitter;
|
||||||
this.jobMonitor = jobMonitor;
|
this.jobMonitor = jobMonitor;
|
||||||
}
|
}
|
||||||
@ -65,6 +76,8 @@ public void printUsages() {
|
|||||||
|
|
||||||
private Options generateOptions() {
|
private Options generateOptions() {
|
||||||
Options options = new Options();
|
Options options = new Options();
|
||||||
|
options.addOption(CliConstants.YAML_CONFIG, true,
|
||||||
|
"Config file (in YAML format)");
|
||||||
options.addOption(CliConstants.NAME, true, "Name of the job");
|
options.addOption(CliConstants.NAME, true, "Name of the job");
|
||||||
options.addOption(CliConstants.INPUT_PATH, true,
|
options.addOption(CliConstants.INPUT_PATH, true,
|
||||||
"Input of the job, could be local or other FS directory");
|
"Input of the job, could be local or other FS directory");
|
||||||
@ -77,7 +90,7 @@ private Options generateOptions() {
|
|||||||
+ "exported model is not placed under ${checkpoint_path}"
|
+ "exported model is not placed under ${checkpoint_path}"
|
||||||
+ "could be local or other FS directory. This will be used to serve.");
|
+ "could be local or other FS directory. This will be used to serve.");
|
||||||
options.addOption(CliConstants.N_WORKERS, true,
|
options.addOption(CliConstants.N_WORKERS, true,
|
||||||
"Numnber of worker tasks of the job, by default it's 1");
|
"Number of worker tasks of the job, by default it's 1");
|
||||||
options.addOption(CliConstants.N_PS, true,
|
options.addOption(CliConstants.N_PS, true,
|
||||||
"Number of PS tasks of the job, by default it's 0");
|
"Number of PS tasks of the job, by default it's 0");
|
||||||
options.addOption(CliConstants.WORKER_RES, true,
|
options.addOption(CliConstants.WORKER_RES, true,
|
||||||
@ -119,7 +132,7 @@ private Options generateOptions() {
|
|||||||
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default.");
|
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default.");
|
||||||
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
|
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
|
||||||
+ "web UI shows link to given role instance and port. When "
|
+ "web UI shows link to given role instance and port. When "
|
||||||
+ "--tensorboard is speciied, quicklink to tensorboard instance will "
|
+ "--tensorboard is specified, quicklink to tensorboard instance will "
|
||||||
+ "be added automatically. The format of quick link is: "
|
+ "be added automatically. The format of quick link is: "
|
||||||
+ "Quick_link_label=http(or https)://role-name:port. For example, "
|
+ "Quick_link_label=http(or https)://role-name:port. For example, "
|
||||||
+ "if want to link to first worker's 7070 port, and text of quicklink "
|
+ "if want to link to first worker's 7070 port, and text of quicklink "
|
||||||
@ -149,7 +162,7 @@ private Options generateOptions() {
|
|||||||
"by the job under security environment");
|
"by the job under security environment");
|
||||||
options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
|
options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
|
||||||
"local keytab to cluster machines for service authentication. If not " +
|
"local keytab to cluster machines for service authentication. If not " +
|
||||||
"sepcified, pre-destributed keytab of which path specified by" +
|
"specified, pre-distributed keytab of which path specified by" +
|
||||||
" parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
|
" parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
|
||||||
"used");
|
"used");
|
||||||
options.addOption("h", "help", false, "Print help");
|
options.addOption("h", "help", false, "Print help");
|
||||||
@ -180,10 +193,10 @@ private void parseCommandLineAndGetRunJobParameters(String[] args)
|
|||||||
// Do parsing
|
// Do parsing
|
||||||
GnuParser parser = new GnuParser();
|
GnuParser parser = new GnuParser();
|
||||||
CommandLine cli = parser.parse(options, args);
|
CommandLine cli = parser.parse(options, args);
|
||||||
parameters.updateParametersByParsedCommandline(cli, options,
|
ParametersHolder parametersHolder = createParametersHolder(cli);
|
||||||
clientContext);
|
parameters.updateParameters(parametersHolder, clientContext);
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
LOG.error("Exception in parse:", e.getMessage());
|
LOG.error("Exception in parse: {}", e.getMessage());
|
||||||
printUsages();
|
printUsages();
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
@ -195,6 +208,51 @@ private void parseCommandLineAndGetRunJobParameters(String[] args)
|
|||||||
replacePatternsInParameters();
|
replacePatternsInParameters();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private ParametersHolder createParametersHolder(CommandLine cli) {
|
||||||
|
String yamlConfigFile =
|
||||||
|
cli.getOptionValue(CliConstants.YAML_CONFIG);
|
||||||
|
if (yamlConfigFile != null) {
|
||||||
|
YamlConfigFile yamlConfig = readYamlConfigFile(yamlConfigFile);
|
||||||
|
if (yamlConfig == null) {
|
||||||
|
throw new YamlParseException(String.format(
|
||||||
|
YAML_PARSE_FAILED + ", file is empty: %s", yamlConfigFile));
|
||||||
|
} else if (yamlConfig.getConfigs() == null) {
|
||||||
|
throw new YamlParseException(String.format(YAML_PARSE_FAILED +
|
||||||
|
", config section should be defined, but it cannot be found in " +
|
||||||
|
"YAML file '%s'!", yamlConfigFile));
|
||||||
|
}
|
||||||
|
LOG.info("Using YAML configuration!");
|
||||||
|
return ParametersHolder.createWithCmdLineAndYaml(cli, yamlConfig);
|
||||||
|
} else {
|
||||||
|
LOG.info("Using CLI configuration!");
|
||||||
|
return ParametersHolder.createWithCmdLine(cli);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private YamlConfigFile readYamlConfigFile(String filename) {
|
||||||
|
Constructor constructor = new Constructor(YamlConfigFile.class);
|
||||||
|
constructor.setPropertyUtils(new UnderscoreConverterPropertyUtils());
|
||||||
|
try {
|
||||||
|
LOG.info("Reading YAML configuration from file: {}", filename);
|
||||||
|
Yaml yaml = new Yaml(constructor);
|
||||||
|
return yaml.loadAs(FileUtils.openInputStream(new File(filename)),
|
||||||
|
YamlConfigFile.class);
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
logExceptionOfYamlParse(filename, e);
|
||||||
|
throw new YamlParseException(YAML_PARSE_FAILED +
|
||||||
|
", file does not exist!");
|
||||||
|
} catch (Exception e) {
|
||||||
|
logExceptionOfYamlParse(filename, e);
|
||||||
|
throw new YamlParseException(
|
||||||
|
String.format(YAML_PARSE_FAILED + ", details: %s", e.getMessage()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void logExceptionOfYamlParse(String filename, Exception e) {
|
||||||
|
LOG.error(String.format("Exception while parsing YAML file %s", filename),
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
|
||||||
private void setDefaultDirs() throws IOException {
|
private void setDefaultDirs() throws IOException {
|
||||||
// Create directories if needed
|
// Create directories if needed
|
||||||
String jobDir = parameters.getCheckpointPath();
|
String jobDir = parameters.getCheckpointPath();
|
||||||
@ -248,8 +306,7 @@ private void storeJobInformation(String jobName, ApplicationId applicationId,
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int run(String[] args)
|
public int run(String[] args)
|
||||||
throws ParseException, IOException, YarnException, InterruptedException,
|
throws ParseException, IOException, YarnException, SubmarineException {
|
||||||
SubmarineException {
|
|
||||||
if (CliUtils.argsForHelp(args)) {
|
if (CliUtils.argsForHelp(args)) {
|
||||||
printUsages();
|
printUsages();
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
import org.apache.commons.cli.Options;
|
import org.apache.commons.cli.Options;
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.ParametersHolder;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.param.ShowJobParameters;
|
import org.apache.hadoop.yarn.submarine.client.cli.param.ShowJobParameters;
|
||||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
@ -61,8 +62,9 @@ private void parseCommandLineAndGetShowJobParameters(String[] args)
|
|||||||
CommandLine cli;
|
CommandLine cli;
|
||||||
try {
|
try {
|
||||||
cli = parser.parse(options, args);
|
cli = parser.parse(options, args);
|
||||||
parameters.updateParametersByParsedCommandline(cli, options,
|
ParametersHolder parametersHolder = ParametersHolder
|
||||||
clientContext);
|
.createWithCmdLine(cli);
|
||||||
|
parameters.updateParameters(parametersHolder, clientContext);
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
printUsages();
|
printUsages();
|
||||||
}
|
}
|
||||||
@ -117,7 +119,6 @@ public int run(String[] args)
|
|||||||
printUsages();
|
printUsages();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
parseCommandLineAndGetShowJobParameters(args);
|
parseCommandLineAndGetShowJobParameters(args);
|
||||||
getAndPrintJobInfo();
|
getAndPrintJobInfo();
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -14,8 +14,6 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
import org.apache.commons.cli.CommandLine;
|
|
||||||
import org.apache.commons.cli.Options;
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
@ -30,15 +28,15 @@
|
|||||||
public abstract class BaseParameters {
|
public abstract class BaseParameters {
|
||||||
private String name;
|
private String name;
|
||||||
|
|
||||||
public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
public void updateParameters(ParametersHolder parametersHolder,
|
||||||
Options options, ClientContext clientContext)
|
ClientContext clientContext)
|
||||||
throws ParseException, IOException, YarnException {
|
throws ParseException, IOException, YarnException {
|
||||||
String name = parsedCommandLine.getOptionValue(CliConstants.NAME);
|
String name = parametersHolder.getOptionValue(CliConstants.NAME);
|
||||||
if (name == null) {
|
if (name == null) {
|
||||||
throw new ParseException("--name is absent");
|
throw new ParseException("--name is absent");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parsedCommandLine.hasOption(CliConstants.VERBOSE)) {
|
if (parametersHolder.hasOption(CliConstants.VERBOSE)) {
|
||||||
SubmarineLogs.verboseOn();
|
SubmarineLogs.verboseOn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,315 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableSet;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Configs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Role;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Roles;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Scheduling;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Security;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.TensorBoard;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlConfigFile;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class acts as a wrapper of {@code CommandLine} values along with
|
||||||
|
* YAML configuration values.
|
||||||
|
* YAML configuration is only stored if the -f <filename>
|
||||||
|
* option is specified along the CLI arguments.
|
||||||
|
* Using this wrapper class makes easy to deal with
|
||||||
|
* any form of configuration source potentially added into Submarine,
|
||||||
|
* in the future.
|
||||||
|
* If both YAML and CLI value is found for a config, this is an error case.
|
||||||
|
*/
|
||||||
|
public final class ParametersHolder {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(ParametersHolder.class);
|
||||||
|
|
||||||
|
private final CommandLine parsedCommandLine;
|
||||||
|
private final Map<String, String> yamlStringConfigs;
|
||||||
|
private final Map<String, List<String>> yamlListConfigs;
|
||||||
|
private final ImmutableSet onlyDefinedWithCliArgs = ImmutableSet.of(
|
||||||
|
CliConstants.VERBOSE);
|
||||||
|
|
||||||
|
private ParametersHolder(CommandLine parsedCommandLine,
|
||||||
|
YamlConfigFile yamlConfig) {
|
||||||
|
this.parsedCommandLine = parsedCommandLine;
|
||||||
|
this.yamlStringConfigs = initStringConfigValues(yamlConfig);
|
||||||
|
this.yamlListConfigs = initListConfigValues(yamlConfig);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maps every value coming from the passed yamlConfig to {@code CliConstants}.
|
||||||
|
* @param yamlConfig Parsed YAML config
|
||||||
|
* @return A map of config values, keys are {@code CliConstants}
|
||||||
|
* and values are Strings.
|
||||||
|
*/
|
||||||
|
private Map<String, String> initStringConfigValues(
|
||||||
|
YamlConfigFile yamlConfig) {
|
||||||
|
if (yamlConfig == null) {
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
Map<String, String> yamlConfigValues = Maps.newHashMap();
|
||||||
|
Roles roles = yamlConfig.getRoles();
|
||||||
|
|
||||||
|
initGenericConfigs(yamlConfig, yamlConfigValues);
|
||||||
|
initPs(yamlConfigValues, roles.getPs());
|
||||||
|
initWorker(yamlConfigValues, roles.getWorker());
|
||||||
|
initScheduling(yamlConfigValues, yamlConfig.getScheduling());
|
||||||
|
initSecurity(yamlConfigValues, yamlConfig.getSecurity());
|
||||||
|
initTensorBoard(yamlConfigValues, yamlConfig.getTensorBoard());
|
||||||
|
|
||||||
|
return yamlConfigValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, List<String>> initListConfigValues(
|
||||||
|
YamlConfigFile yamlConfig) {
|
||||||
|
if (yamlConfig == null) {
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, List<String>> yamlConfigValues = Maps.newHashMap();
|
||||||
|
Configs configs = yamlConfig.getConfigs();
|
||||||
|
yamlConfigValues.put(CliConstants.LOCALIZATION, configs.getLocalizations());
|
||||||
|
yamlConfigValues.put(CliConstants.ENV,
|
||||||
|
convertToEnvsList(configs.getEnvs()));
|
||||||
|
yamlConfigValues.put(CliConstants.QUICKLINK, configs.getQuicklinks());
|
||||||
|
|
||||||
|
return yamlConfigValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initGenericConfigs(YamlConfigFile yamlConfig,
|
||||||
|
Map<String, String> yamlConfigs) {
|
||||||
|
yamlConfigs.put(CliConstants.NAME, yamlConfig.getSpec().getName());
|
||||||
|
|
||||||
|
Configs configs = yamlConfig.getConfigs();
|
||||||
|
yamlConfigs.put(CliConstants.INPUT_PATH, configs.getInputPath());
|
||||||
|
yamlConfigs.put(CliConstants.CHECKPOINT_PATH, configs.getCheckpointPath());
|
||||||
|
yamlConfigs.put(CliConstants.SAVED_MODEL_PATH, configs.getSavedModelPath());
|
||||||
|
yamlConfigs.put(CliConstants.DOCKER_IMAGE, configs.getDockerImage());
|
||||||
|
yamlConfigs.put(CliConstants.WAIT_JOB_FINISH, configs.getWaitJobFinish());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initPs(Map<String, String> yamlConfigs, Role ps) {
|
||||||
|
if (ps == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
yamlConfigs.put(CliConstants.N_PS, String.valueOf(ps.getReplicas()));
|
||||||
|
yamlConfigs.put(CliConstants.PS_RES, ps.getResources());
|
||||||
|
yamlConfigs.put(CliConstants.PS_DOCKER_IMAGE, ps.getDockerImage());
|
||||||
|
yamlConfigs.put(CliConstants.PS_LAUNCH_CMD, ps.getLaunchCmd());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initWorker(Map<String, String> yamlConfigs, Role worker) {
|
||||||
|
if (worker == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
yamlConfigs.put(CliConstants.N_WORKERS,
|
||||||
|
String.valueOf(worker.getReplicas()));
|
||||||
|
yamlConfigs.put(CliConstants.WORKER_RES, worker.getResources());
|
||||||
|
yamlConfigs.put(CliConstants.WORKER_DOCKER_IMAGE, worker.getDockerImage());
|
||||||
|
yamlConfigs.put(CliConstants.WORKER_LAUNCH_CMD, worker.getLaunchCmd());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initScheduling(Map<String, String> yamlConfigValues,
|
||||||
|
Scheduling scheduling) {
|
||||||
|
if (scheduling == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
yamlConfigValues.put(CliConstants.QUEUE, scheduling.getQueue());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initSecurity(Map<String, String> yamlConfigValues,
|
||||||
|
Security security) {
|
||||||
|
if (security == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
yamlConfigValues.put(CliConstants.KEYTAB, security.getKeytab());
|
||||||
|
yamlConfigValues.put(CliConstants.PRINCIPAL, security.getPrincipal());
|
||||||
|
yamlConfigValues.put(CliConstants.DISTRIBUTE_KEYTAB,
|
||||||
|
String.valueOf(security.isDistributeKeytab()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initTensorBoard(Map<String, String> yamlConfigValues,
|
||||||
|
TensorBoard tensorBoard) {
|
||||||
|
if (tensorBoard == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
yamlConfigValues.put(CliConstants.TENSORBOARD, Boolean.TRUE.toString());
|
||||||
|
yamlConfigValues.put(CliConstants.TENSORBOARD_DOCKER_IMAGE,
|
||||||
|
tensorBoard.getDockerImage());
|
||||||
|
yamlConfigValues.put(CliConstants.TENSORBOARD_RESOURCES,
|
||||||
|
tensorBoard.getResources());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> convertToEnvsList(Map<String, String> envs) {
|
||||||
|
if (envs == null) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return envs.entrySet().stream()
|
||||||
|
.map(e -> String.format("%s=%s", e.getKey(), e.getValue()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ParametersHolder createWithCmdLine(CommandLine cli) {
|
||||||
|
return new ParametersHolder(cli, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ParametersHolder createWithCmdLineAndYaml(CommandLine cli,
|
||||||
|
YamlConfigFile yamlConfig) {
|
||||||
|
return new ParametersHolder(cli, yamlConfig);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the option value, either from the CLI arguments or YAML config,
|
||||||
|
* if present.
|
||||||
|
* @param option Name of the config.
|
||||||
|
* @return The value of the config
|
||||||
|
*/
|
||||||
|
String getOptionValue(String option) throws YarnException {
|
||||||
|
ensureConfigIsDefinedOnce(option, true);
|
||||||
|
if (onlyDefinedWithCliArgs.contains(option) ||
|
||||||
|
parsedCommandLine.hasOption(option)) {
|
||||||
|
return getValueFromCLI(option);
|
||||||
|
}
|
||||||
|
return getValueFromYaml(option);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the option values, either from the CLI arguments or YAML config,
|
||||||
|
* if present.
|
||||||
|
* @param option Name of the config.
|
||||||
|
* @return The values of the config
|
||||||
|
*/
|
||||||
|
List<String> getOptionValues(String option) throws YarnException {
|
||||||
|
ensureConfigIsDefinedOnce(option, false);
|
||||||
|
if (onlyDefinedWithCliArgs.contains(option) ||
|
||||||
|
parsedCommandLine.hasOption(option)) {
|
||||||
|
return getValuesFromCLI(option);
|
||||||
|
}
|
||||||
|
return getValuesFromYaml(option);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void ensureConfigIsDefinedOnce(String option, boolean stringValue)
|
||||||
|
throws YarnException {
|
||||||
|
boolean definedWithYaml;
|
||||||
|
if (stringValue) {
|
||||||
|
definedWithYaml = yamlStringConfigs.containsKey(option);
|
||||||
|
} else {
|
||||||
|
definedWithYaml = yamlListConfigs.containsKey(option);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parsedCommandLine.hasOption(option) && definedWithYaml) {
|
||||||
|
throw new YarnException("Config '%s' is defined both with YAML config" +
|
||||||
|
" and with CLI argument, please only use either way!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getValueFromCLI(String option) {
|
||||||
|
String value = parsedCommandLine.getOptionValue(option);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found config value {} for key {} " +
|
||||||
|
"from CLI configuration.", value, option);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> getValuesFromCLI(String option) {
|
||||||
|
String[] optionValues = parsedCommandLine.getOptionValues(option);
|
||||||
|
if (optionValues != null) {
|
||||||
|
List<String> values = Arrays.asList(optionValues);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found config values {} for key {} " +
|
||||||
|
"from CLI configuration.", values, option);
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
} else {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("No config values found for key {} " +
|
||||||
|
"from CLI configuration.", option);
|
||||||
|
}
|
||||||
|
return Lists.newArrayList();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getValueFromYaml(String option) {
|
||||||
|
String value = yamlStringConfigs.get(option);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found config value {} for key {} " +
|
||||||
|
"from YAML configuration.", value, option);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> getValuesFromYaml(String option) {
|
||||||
|
List<String> values = yamlListConfigs.get(option);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found config values {} for key {} " +
|
||||||
|
"from YAML configuration.", values, option);
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the boolean value of option.
|
||||||
|
* First, we check if the CLI value is defined for the option.
|
||||||
|
* If not, then we check the YAML value.
|
||||||
|
* @param option name of the option
|
||||||
|
* @return true, if the option is found in the CLI args or in the YAML config,
|
||||||
|
* false otherwise.
|
||||||
|
*/
|
||||||
|
boolean hasOption(String option) {
|
||||||
|
if (onlyDefinedWithCliArgs.contains(option)) {
|
||||||
|
boolean value = parsedCommandLine.hasOption(option);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found boolean config with value {} for key {} " +
|
||||||
|
"from CLI configuration.", value, option);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
if (parsedCommandLine.hasOption(option)) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Found boolean config value for key {} " +
|
||||||
|
"from CLI configuration.", option);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return getBooleanValueFromYaml(option);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean getBooleanValueFromYaml(String option) {
|
||||||
|
String stringValue = yamlStringConfigs.get(option);
|
||||||
|
boolean result = stringValue != null
|
||||||
|
&& Boolean.valueOf(stringValue).equals(Boolean.TRUE);
|
||||||
|
LOG.debug("Found config value {} for key {} " +
|
||||||
|
"from YAML configuration.", result, option);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
@ -14,8 +14,8 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
import org.apache.commons.cli.CommandLine;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.commons.cli.Options;
|
import com.google.common.base.CaseFormat;
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
@ -23,7 +23,10 @@
|
|||||||
import org.apache.hadoop.yarn.submarine.client.cli.CliUtils;
|
import org.apache.hadoop.yarn.submarine.client.cli.CliUtils;
|
||||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
|
import org.yaml.snakeyaml.introspector.Property;
|
||||||
|
import org.yaml.snakeyaml.introspector.PropertyUtils;
|
||||||
|
|
||||||
|
import java.beans.IntrospectionException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -58,28 +61,31 @@ public class RunJobParameters extends RunParameters {
|
|||||||
private boolean distributeKeytab = false;
|
private boolean distributeKeytab = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
public void updateParameters(ParametersHolder parametersHolder,
|
||||||
Options options, ClientContext clientContext)
|
ClientContext clientContext)
|
||||||
throws ParseException, IOException, YarnException {
|
throws ParseException, IOException, YarnException {
|
||||||
|
|
||||||
String input = parsedCommandLine.getOptionValue(CliConstants.INPUT_PATH);
|
String input = parametersHolder.getOptionValue(CliConstants.INPUT_PATH);
|
||||||
String jobDir = parsedCommandLine.getOptionValue(CliConstants.CHECKPOINT_PATH);
|
String jobDir = parametersHolder.getOptionValue(
|
||||||
|
CliConstants.CHECKPOINT_PATH);
|
||||||
int nWorkers = 1;
|
int nWorkers = 1;
|
||||||
if (parsedCommandLine.getOptionValue(CliConstants.N_WORKERS) != null) {
|
if (parametersHolder.getOptionValue(CliConstants.N_WORKERS) != null) {
|
||||||
nWorkers = Integer.parseInt(
|
nWorkers = Integer.parseInt(
|
||||||
parsedCommandLine.getOptionValue(CliConstants.N_WORKERS));
|
parametersHolder.getOptionValue(CliConstants.N_WORKERS));
|
||||||
// Only check null value.
|
// Only check null value.
|
||||||
// Training job shouldn't ignore INPUT_PATH option
|
// Training job shouldn't ignore INPUT_PATH option
|
||||||
// But if nWorkers is 0, INPUT_PATH can be ignored because user can only run Tensorboard
|
// But if nWorkers is 0, INPUT_PATH can be ignored because
|
||||||
|
// user can only run Tensorboard
|
||||||
if (null == input && 0 != nWorkers) {
|
if (null == input && 0 != nWorkers) {
|
||||||
throw new ParseException("\"--" + CliConstants.INPUT_PATH + "\" is absent");
|
throw new ParseException("\"--" + CliConstants.INPUT_PATH +
|
||||||
|
"\" is absent");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int nPS = 0;
|
int nPS = 0;
|
||||||
if (parsedCommandLine.getOptionValue(CliConstants.N_PS) != null) {
|
if (parametersHolder.getOptionValue(CliConstants.N_PS) != null) {
|
||||||
nPS = Integer.parseInt(
|
nPS = Integer.parseInt(
|
||||||
parsedCommandLine.getOptionValue(CliConstants.N_PS));
|
parametersHolder.getOptionValue(CliConstants.N_PS));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check #workers and #ps.
|
// Check #workers and #ps.
|
||||||
@ -91,15 +97,15 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
+ "please double check.");
|
+ "please double check.");
|
||||||
}
|
}
|
||||||
|
|
||||||
String kerberosKeytab = parsedCommandLine.getOptionValue(
|
String kerberosKeytab = parametersHolder.getOptionValue(
|
||||||
CliConstants.KEYTAB);
|
CliConstants.KEYTAB);
|
||||||
String kerberosPrincipal = parsedCommandLine.getOptionValue(
|
String kerberosPrincipal = parametersHolder.getOptionValue(
|
||||||
CliConstants.PRINCIPAL);
|
CliConstants.PRINCIPAL);
|
||||||
CliUtils.doLoginIfSecure(kerberosKeytab, kerberosPrincipal);
|
CliUtils.doLoginIfSecure(kerberosKeytab, kerberosPrincipal);
|
||||||
|
|
||||||
workerResource = null;
|
workerResource = null;
|
||||||
if (nWorkers > 0) {
|
if (nWorkers > 0) {
|
||||||
String workerResourceStr = parsedCommandLine.getOptionValue(
|
String workerResourceStr = parametersHolder.getOptionValue(
|
||||||
CliConstants.WORKER_RES);
|
CliConstants.WORKER_RES);
|
||||||
if (workerResourceStr == null) {
|
if (workerResourceStr == null) {
|
||||||
throw new ParseException(
|
throw new ParseException(
|
||||||
@ -112,7 +118,8 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
|
|
||||||
Resource psResource = null;
|
Resource psResource = null;
|
||||||
if (nPS > 0) {
|
if (nPS > 0) {
|
||||||
String psResourceStr = parsedCommandLine.getOptionValue(CliConstants.PS_RES);
|
String psResourceStr = parametersHolder.getOptionValue(
|
||||||
|
CliConstants.PS_RES);
|
||||||
if (psResourceStr == null) {
|
if (psResourceStr == null) {
|
||||||
throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
|
throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
|
||||||
}
|
}
|
||||||
@ -121,9 +128,9 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
}
|
}
|
||||||
|
|
||||||
boolean tensorboard = false;
|
boolean tensorboard = false;
|
||||||
if (parsedCommandLine.hasOption(CliConstants.TENSORBOARD)) {
|
if (parametersHolder.hasOption(CliConstants.TENSORBOARD)) {
|
||||||
tensorboard = true;
|
tensorboard = true;
|
||||||
String tensorboardResourceStr = parsedCommandLine.getOptionValue(
|
String tensorboardResourceStr = parametersHolder.getOptionValue(
|
||||||
CliConstants.TENSORBOARD_RESOURCES);
|
CliConstants.TENSORBOARD_RESOURCES);
|
||||||
if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
|
if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
|
||||||
tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
|
tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
|
||||||
@ -131,17 +138,17 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
tensorboardResource = ResourceUtils.createResourceFromString(
|
tensorboardResource = ResourceUtils.createResourceFromString(
|
||||||
tensorboardResourceStr,
|
tensorboardResourceStr,
|
||||||
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
|
clientContext.getOrCreateYarnClient().getResourceTypeInfo());
|
||||||
tensorboardDockerImage = parsedCommandLine.getOptionValue(
|
tensorboardDockerImage = parametersHolder.getOptionValue(
|
||||||
CliConstants.TENSORBOARD_DOCKER_IMAGE);
|
CliConstants.TENSORBOARD_DOCKER_IMAGE);
|
||||||
this.setTensorboardResource(tensorboardResource);
|
this.setTensorboardResource(tensorboardResource);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parsedCommandLine.hasOption(CliConstants.WAIT_JOB_FINISH)) {
|
if (parametersHolder.hasOption(CliConstants.WAIT_JOB_FINISH)) {
|
||||||
this.waitJobFinish = true;
|
this.waitJobFinish = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Quicklinks
|
// Quicklinks
|
||||||
String[] quicklinkStrs = parsedCommandLine.getOptionValues(
|
List<String> quicklinkStrs = parametersHolder.getOptionValues(
|
||||||
CliConstants.QUICKLINK);
|
CliConstants.QUICKLINK);
|
||||||
if (quicklinkStrs != null) {
|
if (quicklinkStrs != null) {
|
||||||
for (String ql : quicklinkStrs) {
|
for (String ql : quicklinkStrs) {
|
||||||
@ -151,18 +158,18 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
psDockerImage = parsedCommandLine.getOptionValue(
|
psDockerImage = parametersHolder.getOptionValue(
|
||||||
CliConstants.PS_DOCKER_IMAGE);
|
CliConstants.PS_DOCKER_IMAGE);
|
||||||
workerDockerImage = parsedCommandLine.getOptionValue(
|
workerDockerImage = parametersHolder.getOptionValue(
|
||||||
CliConstants.WORKER_DOCKER_IMAGE);
|
CliConstants.WORKER_DOCKER_IMAGE);
|
||||||
|
|
||||||
String workerLaunchCmd = parsedCommandLine.getOptionValue(
|
String workerLaunchCmd = parametersHolder.getOptionValue(
|
||||||
CliConstants.WORKER_LAUNCH_CMD);
|
CliConstants.WORKER_LAUNCH_CMD);
|
||||||
String psLaunchCommand = parsedCommandLine.getOptionValue(
|
String psLaunchCommand = parametersHolder.getOptionValue(
|
||||||
CliConstants.PS_LAUNCH_CMD);
|
CliConstants.PS_LAUNCH_CMD);
|
||||||
|
|
||||||
// Localizations
|
// Localizations
|
||||||
String[] localizationsStr = parsedCommandLine.getOptionValues(
|
List<String> localizationsStr = parametersHolder.getOptionValues(
|
||||||
CliConstants.LOCALIZATION);
|
CliConstants.LOCALIZATION);
|
||||||
if (null != localizationsStr) {
|
if (null != localizationsStr) {
|
||||||
for (String loc : localizationsStr) {
|
for (String loc : localizationsStr) {
|
||||||
@ -171,10 +178,11 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
localizations.add(localization);
|
localizations.add(localization);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
boolean distributeKerberosKeytab = parsedCommandLine.hasOption(CliConstants
|
boolean distributeKerberosKeytab = parametersHolder.hasOption(CliConstants
|
||||||
.DISTRIBUTE_KEYTAB);
|
.DISTRIBUTE_KEYTAB);
|
||||||
|
|
||||||
this.setInputPath(input).setCheckpointPath(jobDir).setNumPS(nPS).setNumWorkers(nWorkers)
|
this.setInputPath(input).setCheckpointPath(jobDir)
|
||||||
|
.setNumPS(nPS).setNumWorkers(nWorkers)
|
||||||
.setPSLaunchCmd(psLaunchCommand).setWorkerLaunchCmd(workerLaunchCmd)
|
.setPSLaunchCmd(psLaunchCommand).setWorkerLaunchCmd(workerLaunchCmd)
|
||||||
.setPsResource(psResource)
|
.setPsResource(psResource)
|
||||||
.setTensorboardEnabled(tensorboard)
|
.setTensorboardEnabled(tensorboard)
|
||||||
@ -182,8 +190,7 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
|||||||
.setPrincipal(kerberosPrincipal)
|
.setPrincipal(kerberosPrincipal)
|
||||||
.setDistributeKeytab(distributeKerberosKeytab);
|
.setDistributeKeytab(distributeKerberosKeytab);
|
||||||
|
|
||||||
super.updateParametersByParsedCommandline(parsedCommandLine,
|
super.updateParameters(parametersHolder, clientContext);
|
||||||
options, clientContext);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getInputPath() {
|
public String getInputPath() {
|
||||||
@ -331,4 +338,20 @@ public RunJobParameters setDistributeKeytab(
|
|||||||
this.distributeKeytab = distributeKerberosKeytab;
|
this.distributeKeytab = distributeKerberosKeytab;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public static class UnderscoreConverterPropertyUtils extends PropertyUtils {
|
||||||
|
@Override
|
||||||
|
public Property getProperty(Class<? extends Object> type, String name)
|
||||||
|
throws IntrospectionException {
|
||||||
|
if (name.indexOf('_') > -1) {
|
||||||
|
name = convertName(name);
|
||||||
|
}
|
||||||
|
return super.getProperty(type, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String convertName(String name) {
|
||||||
|
return CaseFormat.UPPER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,8 +14,6 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
import org.apache.commons.cli.CommandLine;
|
|
||||||
import org.apache.commons.cli.Options;
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
@ -35,33 +33,36 @@ public abstract class RunParameters extends BaseParameters {
|
|||||||
private String queue;
|
private String queue;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
public void updateParameters(ParametersHolder parametersHolder,
|
||||||
Options options, ClientContext clientContext) throws ParseException,
|
ClientContext clientContext) throws ParseException,
|
||||||
IOException, YarnException {
|
IOException, YarnException {
|
||||||
String savedModelPath = parsedCommandLine.getOptionValue(
|
String savedModelPath = parametersHolder.getOptionValue(
|
||||||
CliConstants.SAVED_MODEL_PATH);
|
CliConstants.SAVED_MODEL_PATH);
|
||||||
this.setSavedModelPath(savedModelPath);
|
this.setSavedModelPath(savedModelPath);
|
||||||
|
|
||||||
// Envars
|
List<String> envVars = getEnvVars(parametersHolder);
|
||||||
List<String> envarsList = new ArrayList<>();
|
this.setEnvars(envVars);
|
||||||
String[] envars = parsedCommandLine.getOptionValues(CliConstants.ENV);
|
|
||||||
if (envars != null) {
|
|
||||||
for (String envar : envars) {
|
|
||||||
envarsList.add(envar);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this.setEnvars(envarsList);
|
|
||||||
|
|
||||||
String queue = parsedCommandLine.getOptionValue(
|
String queue = parametersHolder.getOptionValue(
|
||||||
CliConstants.QUEUE);
|
CliConstants.QUEUE);
|
||||||
this.setQueue(queue);
|
this.setQueue(queue);
|
||||||
|
|
||||||
String dockerImage = parsedCommandLine.getOptionValue(
|
String dockerImage = parametersHolder.getOptionValue(
|
||||||
CliConstants.DOCKER_IMAGE);
|
CliConstants.DOCKER_IMAGE);
|
||||||
this.setDockerImageName(dockerImage);
|
this.setDockerImageName(dockerImage);
|
||||||
|
|
||||||
super.updateParametersByParsedCommandline(parsedCommandLine,
|
super.updateParameters(parametersHolder, clientContext);
|
||||||
options, clientContext);
|
}
|
||||||
|
|
||||||
|
private List<String> getEnvVars(ParametersHolder parametersHolder)
|
||||||
|
throws YarnException {
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
List<String> envVarsArray = parametersHolder.getOptionValues(
|
||||||
|
CliConstants.ENV);
|
||||||
|
if (envVarsArray != null) {
|
||||||
|
result.addAll(envVarsArray);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getQueue() {
|
public String getQueue() {
|
||||||
|
@ -0,0 +1,107 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that holds values found in 'configs' section of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class Configs {
|
||||||
|
private String dockerImage;
|
||||||
|
private String inputPath;
|
||||||
|
private String savedModelPath;
|
||||||
|
private String checkpointPath;
|
||||||
|
private List<String> quicklinks;
|
||||||
|
private String waitJobFinish;
|
||||||
|
private Map<String, String> envs;
|
||||||
|
private List<String> localizations;
|
||||||
|
private List<String> mounts;
|
||||||
|
|
||||||
|
public String getDockerImage() {
|
||||||
|
return dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDockerImage(String dockerImage) {
|
||||||
|
this.dockerImage = dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getInputPath() {
|
||||||
|
return inputPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInputPath(String inputPath) {
|
||||||
|
this.inputPath = inputPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSavedModelPath() {
|
||||||
|
return savedModelPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSavedModelPath(String savedModelPath) {
|
||||||
|
this.savedModelPath = savedModelPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getCheckpointPath() {
|
||||||
|
return checkpointPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCheckpointPath(String checkpointPath) {
|
||||||
|
this.checkpointPath = checkpointPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> getEnvs() {
|
||||||
|
return envs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnvs(Map<String, String> envs) {
|
||||||
|
this.envs = envs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getLocalizations() {
|
||||||
|
return localizations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLocalizations(List<String> localizations) {
|
||||||
|
this.localizations = localizations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getMounts() {
|
||||||
|
return mounts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMounts(List<String> mounts) {
|
||||||
|
this.mounts = mounts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getQuicklinks() {
|
||||||
|
return quicklinks;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setQuicklinks(List<String> quicklinks) {
|
||||||
|
this.quicklinks = quicklinks;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getWaitJobFinish() {
|
||||||
|
return waitJobFinish;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWaitJobFinish(String waitJobFinish) {
|
||||||
|
this.waitJobFinish = waitJobFinish;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Holds configuration values for PS (parameter server).
|
||||||
|
* 'ps' is a section underneath the 'roles' section of the YAML
|
||||||
|
* configuration file.
|
||||||
|
*/
|
||||||
|
public class PsRole extends Role {
|
||||||
|
}
|
@ -0,0 +1,91 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for Roles. 'roles' is a section of the YAML configuration file.
|
||||||
|
*/
|
||||||
|
public class Role {
|
||||||
|
private String resources;
|
||||||
|
private int replicas;
|
||||||
|
private String launchCmd;
|
||||||
|
|
||||||
|
//Optional parameters (Can override global config)
|
||||||
|
private String dockerImage;
|
||||||
|
private Map<String, String> envs;
|
||||||
|
private List<String> localizations;
|
||||||
|
private List<String> mounts;
|
||||||
|
|
||||||
|
public String getResources() {
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResources(String resources) {
|
||||||
|
this.resources = resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReplicas() {
|
||||||
|
return replicas;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReplicas(int replicas) {
|
||||||
|
this.replicas = replicas;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLaunchCmd() {
|
||||||
|
return launchCmd;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLaunchCmd(String launchCmd) {
|
||||||
|
this.launchCmd = launchCmd;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDockerImage() {
|
||||||
|
return dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDockerImage(String dockerImage) {
|
||||||
|
this.dockerImage = dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> getEnvs() {
|
||||||
|
return envs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnvs(Map<String, String> envs) {
|
||||||
|
this.envs = envs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getLocalizations() {
|
||||||
|
return localizations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLocalizations(List<String> localizations) {
|
||||||
|
this.localizations = localizations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getMounts() {
|
||||||
|
return mounts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMounts(List<String> mounts) {
|
||||||
|
this.mounts = mounts;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class represents a section of the YAML configuration file.
|
||||||
|
*/
|
||||||
|
public class Roles {
|
||||||
|
private Role worker;
|
||||||
|
private Role ps;
|
||||||
|
|
||||||
|
public Role getWorker() {
|
||||||
|
return worker;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWorker(Role worker) {
|
||||||
|
this.worker = worker;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Role getPs() {
|
||||||
|
return ps;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPs(Role ps) {
|
||||||
|
this.ps = ps;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,32 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that holds values found in 'scheduling' section of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class Scheduling {
|
||||||
|
private String queue;
|
||||||
|
|
||||||
|
public String getQueue() {
|
||||||
|
return queue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setQueue(String queue) {
|
||||||
|
this.queue = queue;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that holds values found in 'security' section of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class Security {
|
||||||
|
private String keytab;
|
||||||
|
private String principal;
|
||||||
|
private boolean distributeKeytab;
|
||||||
|
|
||||||
|
public String getKeytab() {
|
||||||
|
return keytab;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setKeytab(String keytab) {
|
||||||
|
this.keytab = keytab;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPrincipal() {
|
||||||
|
return principal;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPrincipal(String principal) {
|
||||||
|
this.principal = principal;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isDistributeKeytab() {
|
||||||
|
return distributeKeytab;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDistributeKeytab(boolean distributeKeytab) {
|
||||||
|
this.distributeKeytab = distributeKeytab;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that holds values found in 'spec' section of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class Spec {
|
||||||
|
private String name;
|
||||||
|
private String jobType;
|
||||||
|
|
||||||
|
public String getJobType() {
|
||||||
|
return jobType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setJobType(String jobtype) {
|
||||||
|
this.jobType = jobtype;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setName(String name) {
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that holds values found in 'tensorboard' section of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class TensorBoard {
|
||||||
|
private String dockerImage;
|
||||||
|
private String resources;
|
||||||
|
|
||||||
|
public String getDockerImage() {
|
||||||
|
return dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDockerImage(String dockerImage) {
|
||||||
|
this.dockerImage = dockerImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getResources() {
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResources(String resources) {
|
||||||
|
this.resources = resources;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Holds configuration values for the worker role.
|
||||||
|
* 'worker' is a section underneath the 'roles' section of the YAML
|
||||||
|
* configuration file.
|
||||||
|
*/
|
||||||
|
public class WorkerRole extends Role {
|
||||||
|
}
|
@ -0,0 +1,77 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Root class of YAML configuration.
|
||||||
|
*/
|
||||||
|
public class YamlConfigFile {
|
||||||
|
private Spec spec;
|
||||||
|
private Configs configs;
|
||||||
|
private Roles roles;
|
||||||
|
private Scheduling scheduling;
|
||||||
|
private Security security;
|
||||||
|
private TensorBoard tensorBoard;
|
||||||
|
|
||||||
|
public Spec getSpec() {
|
||||||
|
return spec;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSpec(Spec spec) {
|
||||||
|
this.spec = spec;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Configs getConfigs() {
|
||||||
|
return configs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setConfigs(Configs configs) {
|
||||||
|
this.configs = configs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Roles getRoles() {
|
||||||
|
return roles;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRoles(Roles roles) {
|
||||||
|
this.roles = roles;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Scheduling getScheduling() {
|
||||||
|
return scheduling;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setScheduling(Scheduling scheduling) {
|
||||||
|
this.scheduling = scheduling;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Security getSecurity() {
|
||||||
|
return security;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSecurity(Security security) {
|
||||||
|
this.security = security;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TensorBoard getTensorBoard() {
|
||||||
|
return tensorBoard;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTensorBoard(TensorBoard tensorBoard) {
|
||||||
|
this.tensorBoard = tensorBoard;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,27 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
||||||
|
|
||||||
|
/**
 * This exception is thrown if any issue arises while parsing the
 * YAML configuration.
 */
public class YamlParseException extends RuntimeException {
  private static final long serialVersionUID = 1L;

  /**
   * Creates an exception describing a YAML parse failure.
   * @param message human-readable description of the failure
   */
  public YamlParseException(String message) {
    super(message);
  }

  /**
   * Creates an exception that preserves the underlying cause (e.g. the
   * original parser error), so its stack trace is not lost when the
   * failure is reported to the caller.
   * @param message human-readable description of the failure
   * @param cause the underlying exception that triggered this failure
   */
  public YamlParseException(String message, Throwable cause) {
    super(message, cause);
  }
}
|
@ -0,0 +1,19 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* This package contains value classes for the YAML parser.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param.yaml;
|
@ -36,11 +36,13 @@
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
public class TestRunJobCliParsing {
|
public class TestRunJobCliParsing {
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void before() {
|
public void before() {
|
||||||
SubmarineLogs.verboseOff();
|
SubmarineLogs.verboseOff();
|
||||||
@ -56,7 +58,7 @@ public void testPrintHelp() {
|
|||||||
runJobCli.printUsages();
|
runJobCli.printUsages();
|
||||||
}
|
}
|
||||||
|
|
||||||
private MockClientContext getMockClientContext()
|
static MockClientContext getMockClientContext()
|
||||||
throws IOException, YarnException {
|
throws IOException, YarnException {
|
||||||
MockClientContext mockClientContext = new MockClientContext();
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
JobSubmitter mockJobSubmitter = mock(JobSubmitter.class);
|
JobSubmitter mockJobSubmitter = mock(JobSubmitter.class);
|
||||||
@ -92,21 +94,21 @@ public void testBasicRunJobForDistributedTraining() throws Exception {
|
|||||||
|
|
||||||
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
|
||||||
Assert.assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
||||||
Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
||||||
Assert.assertEquals(jobRunParameters.getNumPS(), 2);
|
assertEquals(jobRunParameters.getNumPS(), 2);
|
||||||
Assert.assertEquals(jobRunParameters.getPSLaunchCmd(), "python run-ps.py");
|
assertEquals(jobRunParameters.getPSLaunchCmd(), "python run-ps.py");
|
||||||
Assert.assertEquals(Resources.createResource(4096, 4),
|
assertEquals(Resources.createResource(4096, 4),
|
||||||
jobRunParameters.getPsResource());
|
jobRunParameters.getPsResource());
|
||||||
Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
||||||
"python run-job.py");
|
"python run-job.py");
|
||||||
Assert.assertEquals(Resources.createResource(2048, 2),
|
assertEquals(Resources.createResource(2048, 2),
|
||||||
jobRunParameters.getWorkerResource());
|
jobRunParameters.getWorkerResource());
|
||||||
Assert.assertEquals(jobRunParameters.getDockerImageName(),
|
assertEquals(jobRunParameters.getDockerImageName(),
|
||||||
"tf-docker:1.1.0");
|
"tf-docker:1.1.0");
|
||||||
Assert.assertEquals(jobRunParameters.getKeytab(),
|
assertEquals(jobRunParameters.getKeytab(),
|
||||||
"/keytab/path");
|
"/keytab/path");
|
||||||
Assert.assertEquals(jobRunParameters.getPrincipal(),
|
assertEquals(jobRunParameters.getPrincipal(),
|
||||||
"user/_HOST@domain.com");
|
"user/_HOST@domain.com");
|
||||||
Assert.assertTrue(jobRunParameters.isDistributeKeytab());
|
Assert.assertTrue(jobRunParameters.isDistributeKeytab());
|
||||||
Assert.assertTrue(SubmarineLogs.isVerbose());
|
Assert.assertTrue(SubmarineLogs.isVerbose());
|
||||||
@ -126,12 +128,12 @@ public void testBasicRunJobForSingleNodeTraining() throws Exception {
|
|||||||
|
|
||||||
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
|
||||||
Assert.assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
||||||
Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
||||||
Assert.assertEquals(jobRunParameters.getNumWorkers(), 1);
|
assertEquals(jobRunParameters.getNumWorkers(), 1);
|
||||||
Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
||||||
"python run-job.py");
|
"python run-job.py");
|
||||||
Assert.assertEquals(Resources.createResource(4096, 2),
|
assertEquals(Resources.createResource(4096, 2),
|
||||||
jobRunParameters.getWorkerResource());
|
jobRunParameters.getWorkerResource());
|
||||||
Assert.assertTrue(SubmarineLogs.isVerbose());
|
Assert.assertTrue(SubmarineLogs.isVerbose());
|
||||||
Assert.assertTrue(jobRunParameters.isWaitJobFinish());
|
Assert.assertTrue(jobRunParameters.isWaitJobFinish());
|
||||||
@ -153,7 +155,7 @@ public void testNoInputPathOptionSpecified() throws Exception {
|
|||||||
actualMessage = e.getMessage();
|
actualMessage = e.getMessage();
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
Assert.assertEquals(expectedErrorMessage, actualMessage);
|
assertEquals(expectedErrorMessage, actualMessage);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -182,19 +184,23 @@ public void testLaunchCommandPatternReplace() throws Exception {
|
|||||||
|
|
||||||
runJobCli.run(
|
runJobCli.run(
|
||||||
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
|
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
|
||||||
"--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output",
|
"--input_path", "hdfs://input", "--checkpoint_path",
|
||||||
|
"hdfs://output",
|
||||||
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
|
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
|
||||||
"python run-job.py --input=%input_path% --model_dir=%checkpoint_path% --export_dir=%saved_model_path%/savedmodel",
|
"python run-job.py --input=%input_path% " +
|
||||||
|
"--model_dir=%checkpoint_path% " +
|
||||||
|
"--export_dir=%saved_model_path%/savedmodel",
|
||||||
"--worker_resources", "memory=2048,vcores=2", "--ps_resources",
|
"--worker_resources", "memory=2048,vcores=2", "--ps_resources",
|
||||||
"memory=4096,vcores=4", "--tensorboard", "true", "--ps_launch_cmd",
|
"memory=4096,vcores=4", "--tensorboard", "true", "--ps_launch_cmd",
|
||||||
"python run-ps.py --input=%input_path% --model_dir=%checkpoint_path%/model",
|
"python run-ps.py --input=%input_path% " +
|
||||||
|
"--model_dir=%checkpoint_path%/model",
|
||||||
"--verbose" });
|
"--verbose" });
|
||||||
|
|
||||||
Assert.assertEquals(
|
assertEquals(
|
||||||
"python run-job.py --input=hdfs://input --model_dir=hdfs://output "
|
"python run-job.py --input=hdfs://input --model_dir=hdfs://output "
|
||||||
+ "--export_dir=hdfs://output/savedmodel",
|
+ "--export_dir=hdfs://output/savedmodel",
|
||||||
runJobCli.getRunJobParameters().getWorkerLaunchCmd());
|
runJobCli.getRunJobParameters().getWorkerLaunchCmd());
|
||||||
Assert.assertEquals(
|
assertEquals(
|
||||||
"python run-ps.py --input=hdfs://input --model_dir=hdfs://output/model",
|
"python run-ps.py --input=hdfs://input --model_dir=hdfs://output/model",
|
||||||
runJobCli.getRunJobParameters().getPSLaunchCmd());
|
runJobCli.getRunJobParameters().getPSLaunchCmd());
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,380 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Rule;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.submarine.client.cli.TestRunJobCliParsing.getMockClientContext;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
/**
 * Test class that verifies the correctness of YAML configuration parsing.
 * Fixtures are YAML files loaded from the {@code runjobcliparsing}
 * directory and fed to {@code RunJobCli} via the {@code -f} option.
 */
public class TestRunJobCliParsingYaml {
  // Prefix used by the override fixture for role-level values that
  // replace the top-level config values.
  private static final String OVERRIDDEN_PREFIX = "overridden_";
  // Directory (relative to test resources) holding the YAML fixtures.
  private static final String DIR_NAME = "runjobcliparsing";
  // Temp copy of the fixture used by the current test; deleted in after().
  private File yamlConfig;

  @Before
  public void before() {
    // Each test starts with verbose logging off; tests that pass
    // "--verbose" assert it gets switched on by parsing.
    SubmarineLogs.verboseOff();
  }

  @After
  public void after() {
    // Remove the temp YAML file created by the test (null-safe per usage:
    // some tests never assign yamlConfig).
    YamlConfigTestUtils.deleteFile(yamlConfig);
  }

  @BeforeClass
  public static void configureResourceTypes() {
    // Register the GPU resource type so fixtures that request GPUs can be
    // parsed into Resource objects.
    List<ResourceTypeInfo> resTypes = new ArrayList<>(
        ResourceUtils.getResourcesTypeInfo());
    resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
    ResourceUtils.reinitializeResources(resTypes);
  }

  // JUnit4 rule for asserting expected exception type and message.
  @Rule
  public ExpectedException exception = ExpectedException.none();

  // Convenience overload: verify basic values with the default env list
  // that the full fixtures define.
  private void verifyBasicConfigValues(RunJobParameters jobRunParameters) {
    verifyBasicConfigValues(jobRunParameters,
        ImmutableList.of("env1=env1Value", "env2=env2Value"));
  }

  // Verifies the values parsed from the 'configs' section of the fixture.
  private void verifyBasicConfigValues(RunJobParameters jobRunParameters,
      List<String> expectedEnvs) {
    assertEquals("testInputPath", jobRunParameters.getInputPath());
    assertEquals("testCheckpointPath", jobRunParameters.getCheckpointPath());
    assertEquals("testDockerImage", jobRunParameters.getDockerImageName());

    assertNotNull(jobRunParameters.getLocalizations());
    assertEquals(2, jobRunParameters.getLocalizations().size());

    assertNotNull(jobRunParameters.getQuicklinks());
    assertEquals(2, jobRunParameters.getQuicklinks().size());

    assertTrue(SubmarineLogs.isVerbose());
    assertTrue(jobRunParameters.isWaitJobFinish());

    // Envs are stored as "key=value" strings; each expected entry must
    // be present in the parsed list.
    for (String env : expectedEnvs) {
      assertTrue(String.format(
          "%s should be in env list of jobRunParameters!", env),
          jobRunParameters.getEnvars().contains(env));
    }
  }

  // Verifies the PS role values; 'prefix' distinguishes the plain fixture
  // ("") from the override fixture (OVERRIDDEN_PREFIX).
  private void verifyPsValues(RunJobParameters jobRunParameters,
      String prefix) {
    assertEquals(4, jobRunParameters.getNumPS());
    assertEquals(prefix + "testLaunchCmdPs", jobRunParameters.getPSLaunchCmd());
    assertEquals(prefix + "testDockerImagePs",
        jobRunParameters.getPsDockerImage());
    assertEquals(ResourceTypesTestHelper.newResource(20500L, 34,
        ImmutableMap.<String, String> builder()
            .put(ResourceInformation.GPU_URI, "4").build()),
        jobRunParameters.getPsResource());
  }

  // Verifies the worker role values; see verifyPsValues for 'prefix'.
  private void verifyWorkerValues(RunJobParameters jobRunParameters,
      String prefix) {
    assertEquals(3, jobRunParameters.getNumWorkers());
    assertEquals(prefix + "testLaunchCmdWorker",
        jobRunParameters.getWorkerLaunchCmd());
    assertEquals(prefix + "testDockerImageWorker",
        jobRunParameters.getWorkerDockerImage());
    assertEquals(ResourceTypesTestHelper.newResource(20480L, 32,
        ImmutableMap.<String, String> builder()
            .put(ResourceInformation.GPU_URI, "2").build()),
        jobRunParameters.getWorkerResource());
  }

  // Verifies the values parsed from the 'security' section.
  private void verifySecurityValues(RunJobParameters jobRunParameters) {
    assertEquals("keytabPath", jobRunParameters.getKeytab());
    assertEquals("testPrincipal", jobRunParameters.getPrincipal());
    assertTrue(jobRunParameters.isDistributeKeytab());
  }

  // Verifies the values parsed from the 'tensorBoard' section.
  private void verifyTensorboardValues(RunJobParameters jobRunParameters) {
    assertTrue(jobRunParameters.isTensorboardEnabled());
    assertEquals("tensorboardDockerImage",
        jobRunParameters.getTensorboardDockerImage());
    assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
        ImmutableMap.<String, String> builder()
            .put(ResourceInformation.GPU_URI, "3").build()),
        jobRunParameters.getTensorboardResource());
  }

  /** Happy path: a fully specified valid config parses into all sections. */
  @Test
  public void testValidYamlParsing() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/valid-config.yaml");
    runJobCli.run(
        new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});

    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
    verifyBasicConfigValues(jobRunParameters);
    verifyPsValues(jobRunParameters, "");
    verifyWorkerValues(jobRunParameters, "");
    verifySecurityValues(jobRunParameters);
    verifyTensorboardValues(jobRunParameters);
  }

  /** Mixing -f with a scalar CLI option for the same value must fail. */
  @Test
  public void testYamlAndCliOptionIsDefinedIsInvalid() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/valid-config.yaml");
    String[] args = new String[] {"--name", "my-job",
        "--docker_image", "tf-docker:1.1.0",
        "-f", yamlConfig.getAbsolutePath() };

    // Expectations must be registered before run() for ExpectedException.
    exception.expect(YarnException.class);
    exception.expectMessage("defined both with YAML config and with " +
        "CLI argument");

    runJobCli.run(args);
  }

  /** Same conflict check, but for a repeatable (list-valued) CLI option. */
  @Test
  public void testYamlAndCliOptionIsDefinedIsInvalidWithListOption()
      throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/valid-config.yaml");
    String[] args = new String[] {"--name", "my-job",
        "--quicklink", "AAA=http://master-0:8321",
        "--quicklink", "BBB=http://worker-0:1234",
        "-f", yamlConfig.getAbsolutePath()};

    exception.expect(YarnException.class);
    exception.expectMessage("defined both with YAML config and with " +
        "CLI argument");

    runJobCli.run(args);
  }

  /** Role-level values in the fixture override the top-level ones. */
  @Test
  public void testRoleOverrides() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/valid-config-with-overrides.yaml");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});

    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
    verifyBasicConfigValues(jobRunParameters);
    verifyPsValues(jobRunParameters, OVERRIDDEN_PREFIX);
    verifyWorkerValues(jobRunParameters, OVERRIDDEN_PREFIX);
    verifySecurityValues(jobRunParameters);
    verifyTensorboardValues(jobRunParameters);
  }

  /** Explicit 'false' values in YAML must come through as false. */
  @Test
  public void testFalseValuesForBooleanFields() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/test-false-values.yaml");
    runJobCli.run(
        new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();

    assertFalse(jobRunParameters.isDistributeKeytab());
    assertFalse(jobRunParameters.isWaitJobFinish());
    assertFalse(jobRunParameters.isTensorboardEnabled());
  }

  /** Malformed YAML (bad indentation) surfaces as YamlParseException. */
  @Test
  public void testWrongIndentation() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/wrong-indentation.yaml");

    exception.expect(YamlParseException.class);
    exception.expectMessage("Failed to parse YAML config, details:");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
  }

  /** A non-existing path given to -f fails with YamlParseException. */
  @Test
  public void testWrongFilename() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
    Assert.assertFalse(SubmarineLogs.isVerbose());

    exception.expect(YamlParseException.class);
    runJobCli.run(
        new String[]{"-f", "not-existing", "--verbose"});
  }

  /** An empty YAML file is rejected. */
  @Test
  public void testEmptyFile() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createEmptyTempFile();

    exception.expect(YamlParseException.class);
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
  }

  // NOTE(review): overlaps with testWrongFilename, but additionally pins
  // the "file does not exist" message text.
  @Test
  public void testNotExistingFile() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    exception.expect(YamlParseException.class);
    exception.expectMessage("file does not exist");
    runJobCli.run(
        new String[]{"-f", "blabla", "--verbose"});
  }

  /** An unknown property name in the YAML is a parse error. */
  @Test
  public void testWrongPropertyName() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/wrong-property-name.yaml");

    exception.expect(YamlParseException.class);
    exception.expectMessage("Failed to parse YAML config, details:");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
  }

  /** The 'configs' section is mandatory. */
  @Test
  public void testMissingConfigsSection() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/missing-configs.yaml");

    exception.expect(YamlParseException.class);
    exception.expectMessage("config section should be defined, " +
        "but it cannot be found");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
  }

  /** Optional sections may be absent without causing a parse failure. */
  @Test
  public void testMissingSectionsShouldParsed() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/some-sections-missing.yaml");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
  }

  /** A missing 'principal' in the security section yields a null value. */
  @Test
  public void testMissingPrincipalUnderSecuritySection() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/security-principal-is-missing.yaml");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});

    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
    verifyBasicConfigValues(jobRunParameters);
    verifyPsValues(jobRunParameters, "");
    verifyWorkerValues(jobRunParameters, "");
    verifyTensorboardValues(jobRunParameters);

    //Verify security values
    assertEquals("keytabPath", jobRunParameters.getKeytab());
    assertNull("Principal should be null!", jobRunParameters.getPrincipal());
    assertTrue(jobRunParameters.isDistributeKeytab());
  }

  /** TensorBoard without a dockerImage still parses; image stays null. */
  @Test
  public void testMissingTensorBoardDockerImage() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/tensorboard-dockerimage-is-missing.yaml");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});

    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
    verifyBasicConfigValues(jobRunParameters);
    verifyPsValues(jobRunParameters, "");
    verifyWorkerValues(jobRunParameters, "");
    verifySecurityValues(jobRunParameters);

    assertTrue(jobRunParameters.isTensorboardEnabled());
    assertNull("tensorboardDockerImage should be null!",
        jobRunParameters.getTensorboardDockerImage());
    assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
        ImmutableMap.<String, String> builder()
            .put(ResourceInformation.GPU_URI, "3").build()),
        jobRunParameters.getTensorboardResource());
  }

  /** Missing envs in the fixture result in an empty env list. */
  @Test
  public void testMissingEnvs() throws Exception {
    RunJobCli runJobCli = new RunJobCli(getMockClientContext());

    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
        DIR_NAME + "/envs-are-missing.yaml");
    runJobCli.run(
        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});

    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
    verifyBasicConfigValues(jobRunParameters, ImmutableList.of());
    verifyPsValues(jobRunParameters, "");
    verifyWorkerValues(jobRunParameters, "");
    verifySecurityValues(jobRunParameters);
    verifyTensorboardValues(jobRunParameters);
  }

}
|
@ -0,0 +1,205 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Configs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Role;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Roles;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Scheduling;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Security;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.Spec;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.TensorBoard;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlConfigFile;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils.readYamlConfigFile;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test class that verifies the correctness of YAML configuration parsing.
|
||||||
|
* Please note that this class just tests YAML parsing,
|
||||||
|
* but only in an isolated fashion.
|
||||||
|
*/
|
||||||
|
public class TestRunJobCliParsingYamlStandalone {
|
||||||
|
private static final String OVERRIDDEN_PREFIX = "overridden_";
|
||||||
|
private static final String DIR_NAME = "runjobcliparsing";
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void before() {
|
||||||
|
SubmarineLogs.verboseOff();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyBasicConfigValues(YamlConfigFile yamlConfigFile) {
|
||||||
|
assertNotNull("Spec file should not be null!", yamlConfigFile);
|
||||||
|
Spec spec = yamlConfigFile.getSpec();
|
||||||
|
assertNotNull("Spec should not be null!", spec);
|
||||||
|
|
||||||
|
assertEquals("testJobName", spec.getName());
|
||||||
|
assertEquals("testJobType", spec.getJobType());
|
||||||
|
|
||||||
|
Configs configs = yamlConfigFile.getConfigs();
|
||||||
|
assertNotNull("Configs should not be null!", configs);
|
||||||
|
|
||||||
|
assertEquals("testInputPath", configs.getInputPath());
|
||||||
|
assertEquals("testCheckpointPath", configs.getCheckpointPath());
|
||||||
|
assertEquals("testSavedModelPath", configs.getSavedModelPath());
|
||||||
|
assertEquals("testDockerImage", configs.getDockerImage());
|
||||||
|
|
||||||
|
Map<String, String> envs = configs.getEnvs();
|
||||||
|
assertNotNull("Envs should not be null!", envs);
|
||||||
|
assertEquals(2, envs.size());
|
||||||
|
assertEquals("env1Value", envs.get("env1"));
|
||||||
|
assertEquals("env2Value", envs.get("env2"));
|
||||||
|
|
||||||
|
List<String> localizations = configs.getLocalizations();
|
||||||
|
assertNotNull("Localizations should not be null!", localizations);
|
||||||
|
assertEquals("Size of localizations must be 2!", 2, localizations.size());
|
||||||
|
assertEquals("hdfs://remote-file1:/local-filename1:rw",
|
||||||
|
localizations.get(0));
|
||||||
|
assertEquals("nfs://remote-file2:/local-filename2:rw",
|
||||||
|
localizations.get(1));
|
||||||
|
|
||||||
|
List<String> mounts = configs.getMounts();
|
||||||
|
assertNotNull("Mounts should not be null!", mounts);
|
||||||
|
assertEquals("Size of mounts must be 2!", 2, mounts.size());
|
||||||
|
assertEquals("/etc/passwd:/etc/passwd:rw", mounts.get(0));
|
||||||
|
assertEquals("/etc/hosts:/etc/hosts:rw", mounts.get(1));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
configs.getQuicklinks().contains("Notebook_UI=https://master-0:7070"));
|
||||||
|
assertTrue(
|
||||||
|
configs.getQuicklinks().contains("Notebook_UI2=https://master-0:7071"));
|
||||||
|
assertEquals("true", configs.getWaitJobFinish());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertRoleConfigOverrides(Role role, String prefix,
|
||||||
|
String roleType) {
|
||||||
|
assertNotNull(roleType + " role should not be null!", role);
|
||||||
|
|
||||||
|
assertEquals(String.format("%stestDockerImage%s", prefix, roleType),
|
||||||
|
role.getDockerImage());
|
||||||
|
|
||||||
|
//envs, localizations and mounts for Roles
|
||||||
|
// are only present in valid-config-with-overrides.yaml
|
||||||
|
boolean validateAll = !prefix.equals("");
|
||||||
|
if (validateAll) {
|
||||||
|
Map<String, String> envs = role.getEnvs();
|
||||||
|
assertNotNull("Envs should not be null!", envs);
|
||||||
|
assertEquals(String.format("%senv1%s", prefix, roleType),
|
||||||
|
envs.get("env1"));
|
||||||
|
assertEquals(String.format("%senv2%s", prefix, roleType),
|
||||||
|
envs.get("env2"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (validateAll) {
|
||||||
|
List<String> localizations = role.getLocalizations();
|
||||||
|
assertNotNull("Localizations should not be null!", localizations);
|
||||||
|
assertEquals("Size of localizations must be 2!", 2, localizations.size());
|
||||||
|
assertEquals(String.format("hdfs://remote-file1:/%slocal" +
|
||||||
|
"-filename1%s:rw", prefix, roleType), localizations.get(0));
|
||||||
|
assertEquals(String.format("nfs://remote-file2:/%slocal" +
|
||||||
|
"-filename2%s:rw", prefix, roleType), localizations.get(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (validateAll) {
|
||||||
|
List<String> mounts = role.getMounts();
|
||||||
|
assertNotNull("Mounts should not be null!", mounts);
|
||||||
|
assertEquals("Size of mounts must be 2!", 2, mounts.size());
|
||||||
|
assertEquals(String.format("/etc/passwd:/%s%s", prefix, roleType),
|
||||||
|
mounts.get(0));
|
||||||
|
assertEquals(String.format("/etc/hosts:/%s%s", prefix, roleType),
|
||||||
|
mounts.get(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertWorkerValues(Role worker) {
|
||||||
|
assertEquals("testLaunchCmdWorker", worker.getLaunchCmd());
|
||||||
|
assertEquals("testDockerImageWorker", worker.getDockerImage());
|
||||||
|
assertEquals("memory=20480M,vcores=32,gpu=2", worker.getResources());
|
||||||
|
assertEquals(3, worker.getReplicas());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertPsValues(Role ps) {
|
||||||
|
assertEquals("testLaunchCmdPs", ps.getLaunchCmd());
|
||||||
|
assertEquals("testDockerImagePs", ps.getDockerImage());
|
||||||
|
assertEquals("memory=20500M,vcores=34,gpu=4", ps.getResources());
|
||||||
|
assertEquals(4, ps.getReplicas());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifySchedulingValues(YamlConfigFile yamlConfigFile) {
|
||||||
|
Scheduling scheduling = yamlConfigFile.getScheduling();
|
||||||
|
assertNotNull("Scheduling should not be null!", scheduling);
|
||||||
|
assertEquals("queue1", scheduling.getQueue());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifySecurityValues(YamlConfigFile yamlConfigFile) {
|
||||||
|
Security security = yamlConfigFile.getSecurity();
|
||||||
|
assertNotNull("Security should not be null!", security);
|
||||||
|
assertEquals("keytabPath", security.getKeytab());
|
||||||
|
assertEquals("testPrincipal", security.getPrincipal());
|
||||||
|
assertTrue(security.isDistributeKeytab());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyTensorboardValues(YamlConfigFile yamlConfigFile) {
|
||||||
|
TensorBoard tensorBoard = yamlConfigFile.getTensorBoard();
|
||||||
|
assertNotNull("Tensorboard should not be null!", tensorBoard);
|
||||||
|
assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage());
|
||||||
|
assertEquals("memory=21000M,vcores=37,gpu=3", tensorBoard.getResources());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLaunchCommandYaml() {
|
||||||
|
YamlConfigFile yamlConfigFile = readYamlConfigFile(DIR_NAME +
|
||||||
|
"/valid-config.yaml");
|
||||||
|
|
||||||
|
verifyBasicConfigValues(yamlConfigFile);
|
||||||
|
|
||||||
|
Roles roles = yamlConfigFile.getRoles();
|
||||||
|
assertNotNull("Roles should not be null!", roles);
|
||||||
|
assertRoleConfigOverrides(roles.getWorker(), "", "Worker");
|
||||||
|
assertRoleConfigOverrides(roles.getPs(), "", "Ps");
|
||||||
|
|
||||||
|
assertWorkerValues(roles.getWorker());
|
||||||
|
assertPsValues(roles.getPs());
|
||||||
|
|
||||||
|
verifySchedulingValues(yamlConfigFile);
|
||||||
|
verifySecurityValues(yamlConfigFile);
|
||||||
|
verifyTensorboardValues(yamlConfigFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOverrides() {
|
||||||
|
YamlConfigFile yamlConfigFile = readYamlConfigFile(DIR_NAME +
|
||||||
|
"/valid-config-with-overrides.yaml");
|
||||||
|
|
||||||
|
verifyBasicConfigValues(yamlConfigFile);
|
||||||
|
|
||||||
|
Roles roles = yamlConfigFile.getRoles();
|
||||||
|
assertNotNull("Roles should not be null!", roles);
|
||||||
|
assertRoleConfigOverrides(roles.getWorker(), OVERRIDDEN_PREFIX, "Worker");
|
||||||
|
assertRoleConfigOverrides(roles.getPs(), OVERRIDDEN_PREFIX, "Ps");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters.UnderscoreConverterPropertyUtils;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlConfigFile;
|
||||||
|
import org.yaml.snakeyaml.Yaml;
|
||||||
|
import org.yaml.snakeyaml.constructor.Constructor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test utility class for test code that deals with YAML configuration parsing.
|
||||||
|
*/
|
||||||
|
public final class YamlConfigTestUtils {
|
||||||
|
|
||||||
|
private YamlConfigTestUtils() {}
|
||||||
|
|
||||||
|
static void deleteFile(File file) {
|
||||||
|
if (file != null) {
|
||||||
|
file.delete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static YamlConfigFile readYamlConfigFile(String filename) {
|
||||||
|
Constructor constructor = new Constructor(YamlConfigFile.class);
|
||||||
|
constructor.setPropertyUtils(new UnderscoreConverterPropertyUtils());
|
||||||
|
Yaml yaml = new Yaml(constructor);
|
||||||
|
InputStream inputStream = YamlConfigTestUtils.class
|
||||||
|
.getClassLoader()
|
||||||
|
.getResourceAsStream(filename);
|
||||||
|
return yaml.loadAs(inputStream, YamlConfigFile.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
static File createTempFileWithContents(String filename) throws IOException {
|
||||||
|
InputStream inputStream = YamlConfigTestUtils.class
|
||||||
|
.getClassLoader()
|
||||||
|
.getResourceAsStream(filename);
|
||||||
|
File targetFile = File.createTempFile("test", ".yaml");
|
||||||
|
FileUtils.copyInputStreamToFile(inputStream, targetFile);
|
||||||
|
return targetFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
static File createEmptyTempFile() throws IOException {
|
||||||
|
return File.createTempFile("test", ".yaml");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,59 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
docker_image: testDockerImagePs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,41 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,61 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
docker_image: testDockerImagePs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,48 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
@ -0,0 +1,61 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
docker_image: testDockerImagePs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
@ -0,0 +1,56 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
wait_job_finish: false
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: false
|
@ -0,0 +1,81 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: overridden_testLaunchCmdWorker
|
||||||
|
docker_image: overridden_testDockerImageWorker
|
||||||
|
envs:
|
||||||
|
env1: 'overridden_env1Worker'
|
||||||
|
env2: 'overridden_env2Worker'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/overridden_local-filename1Worker:rw
|
||||||
|
- nfs://remote-file2:/overridden_local-filename2Worker:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/overridden_Worker
|
||||||
|
- /etc/hosts:/overridden_Worker
|
||||||
|
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: overridden_testLaunchCmdPs
|
||||||
|
docker_image: overridden_testDockerImagePs
|
||||||
|
envs:
|
||||||
|
env1: 'overridden_env1Ps'
|
||||||
|
env2: 'overridden_env2Ps'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/overridden_local-filename1Ps:rw
|
||||||
|
- nfs://remote-file2:/overridden_local-filename2Ps:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/overridden_Ps
|
||||||
|
- /etc/hosts:/overridden_Ps
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,62 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
docker_image: testDockerImageWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
docker_image: testDockerImagePs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,60 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
configs:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
@ -0,0 +1,60 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
spec:
|
||||||
|
name: testJobName
|
||||||
|
job_type: testJobType
|
||||||
|
|
||||||
|
CONFIGS:
|
||||||
|
input_path: testInputPath
|
||||||
|
checkpoint_path: testCheckpointPath
|
||||||
|
saved_model_path: testSavedModelPath
|
||||||
|
docker_image: testDockerImage
|
||||||
|
quicklinks:
|
||||||
|
- Notebook_UI=https://master-0:7070
|
||||||
|
- Notebook_UI2=https://master-0:7071
|
||||||
|
wait_job_finish: true
|
||||||
|
envs:
|
||||||
|
env1: 'env1Value'
|
||||||
|
env2: 'env2Value'
|
||||||
|
localizations:
|
||||||
|
- hdfs://remote-file1:/local-filename1:rw
|
||||||
|
- nfs://remote-file2:/local-filename2:rw
|
||||||
|
mounts:
|
||||||
|
- /etc/passwd:/etc/passwd:rw
|
||||||
|
- /etc/hosts:/etc/hosts:rw
|
||||||
|
|
||||||
|
scheduling:
|
||||||
|
queue: queue1
|
||||||
|
|
||||||
|
roles:
|
||||||
|
worker:
|
||||||
|
resources: memory=20480M,vcores=32,gpu=2
|
||||||
|
replicas: 3
|
||||||
|
launch_cmd: testLaunchCmdWorker
|
||||||
|
ps:
|
||||||
|
resources: memory=20500M,vcores=34,gpu=4
|
||||||
|
replicas: 4
|
||||||
|
launch_cmd: testLaunchCmdPs
|
||||||
|
|
||||||
|
security:
|
||||||
|
keytab: keytabPath
|
||||||
|
principal: testPrincipal
|
||||||
|
distribute_keytab: true
|
||||||
|
|
||||||
|
tensorBoard:
|
||||||
|
resources: memory=21000M,vcores=37,gpu=3
|
||||||
|
docker_image: tensorboardDockerImage
|
Loading…
Reference in New Issue
Block a user