From 0f25a1bb52bc56661fd020a6ba82df99f8c6ef1f Mon Sep 17 00:00:00 2001 From: Varun Vasudev Date: Fri, 29 Apr 2016 16:09:07 +0530 Subject: [PATCH] YARN-3998. Add support in the NodeManager to re-launch containers. Contributed by Jun Gong. --- .../api/records/ContainerLaunchContext.java | 33 +++ .../api/records/ContainerRetryContext.java | 84 ++++++ .../api/records/ContainerRetryPolicy.java | 35 +++ .../hadoop/yarn/conf/YarnConfiguration.java | 8 + .../src/main/proto/yarn_protos.proto | 14 + .../distributedshell/ApplicationMaster.java | 43 ++- .../applications/distributedshell/Client.java | 34 +++ .../impl/pb/ContainerLaunchContextPBImpl.java | 42 ++- .../impl/pb/ContainerRetryContextPBImpl.java | 177 ++++++++++++ .../yarn/api/records/impl/pb/ProtoUtils.java | 15 + .../src/main/resources/yarn-default.xml | 7 + .../hadoop/yarn/api/TestPBImplRecords.java | 12 +- .../server/nodemanager/ContainerExecutor.java | 1 + .../nodemanager/LocalDirsHandlerService.java | 22 ++ .../ContainerManagerImpl.java | 3 +- .../containermanager/container/Container.java | 12 + .../container/ContainerImpl.java | 185 ++++++++++++- .../container/ContainerState.java | 6 +- .../launcher/ContainerLaunch.java | 258 +++++++++++------- .../launcher/ContainerRelaunch.java | 196 +++++++++++++ .../launcher/ContainersLauncher.java | 10 + .../launcher/ContainersLauncherEventType.java | 1 + .../recovery/NMLeveldbStateStoreService.java | 47 ++++ .../recovery/NMNullStateStoreService.java | 15 + .../recovery/NMStateStoreService.java | 59 ++++ .../container/TestContainer.java | 85 +++++- .../recovery/NMMemoryStateStoreService.java | 24 ++ .../TestNMLeveldbStateStoreService.java | 12 + .../nodemanager/webapp/MockContainer.java | 28 ++ 29 files changed, 1352 insertions(+), 116 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryPolicy.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerLaunchContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerLaunchContext.java index 932945bdd1..6d4bccd80c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerLaunchContext.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerLaunchContext.java @@ -24,6 +24,7 @@ import org.apache.hadoop.classification.InterfaceAudience.Public; import org.apache.hadoop.classification.InterfaceStability.Stable; +import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.yarn.api.ContainerManagementProtocol; import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext; import org.apache.hadoop.yarn.server.api.AuxiliaryService; @@ -46,6 +47,7 @@ *
 *     <li>Optional, application-specific binary service data.</li>
 *     <li>Environment variables for the launched process.</li>
 *     <li>Command to launch the container.</li>
+ *     <li>Retry strategy when container exits with failure.</li>
  • * * * @see ContainerManagementProtocol#startContainers(org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest) @@ -61,6 +63,18 @@ public static ContainerLaunchContext newInstance( Map environment, List commands, Map serviceData, ByteBuffer tokens, Map acls) { + return newInstance(localResources, environment, commands, serviceData, + tokens, acls, null); + } + + @Public + @Unstable + public static ContainerLaunchContext newInstance( + Map localResources, + Map environment, List commands, + Map serviceData, ByteBuffer tokens, + Map acls, + ContainerRetryContext containerRetryContext) { ContainerLaunchContext container = Records.newRecord(ContainerLaunchContext.class); container.setLocalResources(localResources); @@ -69,6 +83,7 @@ public static ContainerLaunchContext newInstance( container.setServiceData(serviceData); container.setTokens(tokens); container.setApplicationACLs(acls); + container.setContainerRetryContext(containerRetryContext); return container; } @@ -195,4 +210,22 @@ public static ContainerLaunchContext newInstance( @Public @Stable public abstract void setApplicationACLs(Map acls); + + /** + * Get the ContainerRetryContext to relaunch container. + * @return ContainerRetryContext to relaunch container. + */ + @Public + @Unstable + public abstract ContainerRetryContext getContainerRetryContext(); + + /** + * Set the ContainerRetryContext to relaunch container. + * @param containerRetryContext ContainerRetryContext to + * relaunch container. + */ + @Public + @Unstable + public abstract void setContainerRetryContext( + ContainerRetryContext containerRetryContext); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java new file mode 100644 index 0000000000..ef8bd1763e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.api.records; + +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.classification.InterfaceAudience.Public; +import org.apache.hadoop.classification.InterfaceStability.Unstable; +import org.apache.hadoop.yarn.util.Records; + +import java.util.Set; + +/** + * {@code ContainerRetryContext} indicates how container retry after it fails + * to run. + *

+ * It provides details such as:
+ * <ul>
+ *   <li>
+ *     {@link ContainerRetryPolicy}:
+ *     - NEVER_RETRY (the default): never retry, no matter which error code
+ *       the container fails with.
+ *     - RETRY_ON_ALL_ERRORS: always retry, no matter which error code the
+ *       container fails with.
+ *     - RETRY_ON_SPECIFIC_ERROR_CODES: retry only when the container's error
+ *       code is one of the configured errorCodes; otherwise do not retry.
+ *
+ *     Note: error codes 137 (SIGKILL) and 143 (SIGTERM) are never retried,
+ *     because such containers are usually killed on purpose.
+ *   </li>
+ *   <li>
+ *     maxRetries specifies how many times to retry when a retry is allowed.
+ *     A value of -1 means retry forever.
+ *   </li>
+ *   <li>
+ *     retryInterval specifies how long to wait, in milliseconds, before
+ *     relaunching the container.
+ *   </li>
+ * </ul>
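+ *
+ * A minimal usage sketch (illustrative only; it assumes the usual launch
+ * inputs such as localResources, environment, commands, serviceData, tokens
+ * and acls have already been built elsewhere):
+ * <pre>{@code
+ * // Retry at most 4 times, 10 seconds apart, but only when the container
+ * // exits with code 2 or 3.
+ * ContainerRetryContext retryContext = ContainerRetryContext.newInstance(
+ *     ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES,
+ *     new HashSet<>(Arrays.asList(2, 3)), 4, 10000);
+ * ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
+ *     localResources, environment, commands, serviceData, tokens, acls,
+ *     retryContext);
+ * }</pre>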
    + */ +@Public +@Unstable +public abstract class ContainerRetryContext { + public static final int RETRY_FOREVER = -1; + public static final int RETRY_INVALID = -1000; + public static final ContainerRetryContext NEVER_RETRY_CONTEXT = + newInstance(ContainerRetryPolicy.NEVER_RETRY, null, 0, 0); + + @Private + @Unstable + public static ContainerRetryContext newInstance( + ContainerRetryPolicy retryPolicy, Set errorCodes, + int maxRetries, int retryInterval) { + ContainerRetryContext containerRetryContext = + Records.newRecord(ContainerRetryContext.class); + containerRetryContext.setRetryPolicy(retryPolicy); + containerRetryContext.setErrorCodes(errorCodes); + containerRetryContext.setMaxRetries(maxRetries); + containerRetryContext.setRetryInterval(retryInterval); + return containerRetryContext; + } + + public abstract ContainerRetryPolicy getRetryPolicy(); + public abstract void setRetryPolicy(ContainerRetryPolicy retryPolicy); + public abstract Set getErrorCodes(); + public abstract void setErrorCodes(Set errorCodes); + public abstract int getMaxRetries(); + public abstract void setMaxRetries(int maxRetries); + public abstract int getRetryInterval(); + public abstract void setRetryInterval(int retryInterval); +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryPolicy.java new file mode 100644 index 0000000000..75c9d105af --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryPolicy.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.api.records; + +import org.apache.hadoop.classification.InterfaceAudience.Public; +import org.apache.hadoop.classification.InterfaceStability.Unstable; + +/** + *

+ * Retry policy for relaunching a Container.
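+ *
+ * As an illustration, the distributed shell changes later in this patch pick
+ * the policy by its ordinal, so a --container_retry_policy value of 0, 1 or 2
+ * maps to the constants below in declaration order (optionValue here is a
+ * placeholder for the parsed command-line argument):
+ * <pre>{@code
+ * ContainerRetryPolicy policy =
+ *     ContainerRetryPolicy.values()[Integer.parseInt(optionValue)];
+ * }</pre>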

    + */ +@Public +@Unstable +public enum ContainerRetryPolicy { + /** Never retry. */ + NEVER_RETRY, + /** Retry for all error codes. */ + RETRY_ON_ALL_ERRORS, + /** Retry for specific error codes. */ + RETRY_ON_SPECIFIC_ERROR_CODES +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 75965dd379..a4213cef44 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -846,6 +846,14 @@ public static boolean isAclEnabled(Configuration conf) { NM_PREFIX + "resourcemanager.minimum.version"; public static final String DEFAULT_NM_RESOURCEMANAGER_MINIMUM_VERSION = "NONE"; + /** + * Maximum size of contain's diagnostics to keep for relaunching container + * case. + **/ + public static final String NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE = + NM_PREFIX + "container-diagnostics-maximum-size"; + public static final int DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE = 10000; + /** Interval at which the delayed token removal thread runs */ public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS = RM_PREFIX + "delayed.delegation-token.removal-interval-ms"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto index 635f2f061b..60cdfd155e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto @@ -511,6 +511,7 @@ message ContainerLaunchContextProto { repeated StringStringMapProto environment = 4; repeated string command = 5; repeated ApplicationACLMapProto application_ACLs = 6; + optional ContainerRetryContextProto container_retry_context = 7; } message ContainerStatusProto { @@ -534,6 +535,19 @@ message ContainerResourceChangeRequestProto { optional ResourceProto capability = 2; } +message ContainerRetryContextProto { + optional ContainerRetryPolicyProto retry_policy = 1 [default = NEVER_RETRY]; + repeated int32 error_codes = 2; + optional int32 max_retries = 3 [default = 0]; + optional int32 retry_interval = 4 [default = 0]; +} + +enum ContainerRetryPolicyProto { + NEVER_RETRY = 0; + RETRY_ON_ALL_ERRORS = 1; + RETRY_ON_SPECIFIC_ERROR_CODES = 2; +} + //////////////////////////////////////////////////////////////////////// ////// From common////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java index 2b85ba8dc8..297397421d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -76,6 +77,8 @@ import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; @@ -256,6 +259,13 @@ public static enum DSEntity { // File length needed for local resource private long shellScriptPathLen = 0; + // Container retry options + private ContainerRetryPolicy containerRetryPolicy = + ContainerRetryPolicy.NEVER_RETRY; + private Set containerRetryErrorCodes = null; + private int containerMaxRetries = 0; + private int containrRetryInterval = 0; + // Timeline domain ID private String domainId = null; @@ -378,6 +388,18 @@ public boolean init(String[] args) throws ParseException, IOException { opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed"); opts.addOption("priority", true, "Application Priority. Default 0"); + opts.addOption("container_retry_policy", true, + "Retry policy when container fails to run, " + + "0: NEVER_RETRY, 1: RETRY_ON_ALL_ERRORS, " + + "2: RETRY_ON_SPECIFIC_ERROR_CODES"); + opts.addOption("container_retry_error_codes", true, + "When retry policy is set to RETRY_ON_SPECIFIC_ERROR_CODES, error " + + "codes is specified with this option, " + + "e.g. --container_retry_error_codes 1,2,3"); + opts.addOption("container_max_retries", true, + "If container could retry, it specifies max retires"); + opts.addOption("container_retry_interval", true, + "Interval between each retry, unit is milliseconds"); opts.addOption("debug", false, "Dump out debug information"); opts.addOption("help", false, "Print usage"); @@ -515,6 +537,21 @@ public boolean init(String[] args) throws ParseException, IOException { } requestPriority = Integer.parseInt(cliParser .getOptionValue("priority", "0")); + + containerRetryPolicy = ContainerRetryPolicy.values()[ + Integer.parseInt(cliParser.getOptionValue( + "container_retry_policy", "0"))]; + if (cliParser.hasOption("container_retry_error_codes")) { + containerRetryErrorCodes = new HashSet<>(); + for (String errorCode : + cliParser.getOptionValue("container_retry_error_codes").split(",")) { + containerRetryErrorCodes.add(Integer.parseInt(errorCode)); + } + } + containerMaxRetries = Integer.parseInt( + cliParser.getOptionValue("container_max_retries", "0")); + containrRetryInterval = Integer.parseInt(cliParser.getOptionValue( + "container_retry_interval", "0")); return true; } @@ -1069,9 +1106,13 @@ public void run() { // "hadoop dfs" command inside the distributed shell. 
Map myShellEnv = new HashMap(shellEnv); myShellEnv.put(YARN_SHELL_ID, shellId); + ContainerRetryContext containerRetryContext = + ContainerRetryContext.newInstance( + containerRetryPolicy, containerRetryErrorCodes, + containerMaxRetries, containrRetryInterval); ContainerLaunchContext ctx = ContainerLaunchContext.newInstance( localResources, myShellEnv, commands, null, allTokens.duplicate(), - null); + null, containerRetryContext); containerListener.addContainer(container.getId(), container); nmClientAsync.startContainerAsync(container, ctx); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java index e864ad2300..9139b08e53 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java @@ -169,6 +169,8 @@ public class Client { private long attemptFailuresValidityInterval = -1; + private Vector containerRetryOptions = new Vector<>(5); + // Debug flag boolean debugFlag = false; @@ -288,6 +290,18 @@ public Client(Configuration conf) throws Exception { + " will be allocated, \"\" means containers" + " can be allocated anywhere, if you don't specify the option," + " default node_label_expression of queue will be used."); + opts.addOption("container_retry_policy", true, + "Retry policy when container fails to run, " + + "0: NEVER_RETRY, 1: RETRY_ON_ALL_ERRORS, " + + "2: RETRY_ON_SPECIFIC_ERROR_CODES"); + opts.addOption("container_retry_error_codes", true, + "When retry policy is set to RETRY_ON_SPECIFIC_ERROR_CODES, error " + + "codes is specified with this option, " + + "e.g. 
--container_retry_error_codes 1,2,3"); + opts.addOption("container_max_retries", true, + "If container could retry, it specifies max retires"); + opts.addOption("container_retry_interval", true, + "Interval between each retry, unit is milliseconds"); } /** @@ -430,6 +444,24 @@ public boolean init(String[] args) throws ParseException { } } + // Get container retry options + if (cliParser.hasOption("container_retry_policy")) { + containerRetryOptions.add("--container_retry_policy " + + cliParser.getOptionValue("container_retry_policy")); + } + if (cliParser.hasOption("container_retry_error_codes")) { + containerRetryOptions.add("--container_retry_error_codes " + + cliParser.getOptionValue("container_retry_error_codes")); + } + if (cliParser.hasOption("container_max_retries")) { + containerRetryOptions.add("--container_max_retries " + + cliParser.getOptionValue("container_max_retries")); + } + if (cliParser.hasOption("container_retry_interval")) { + containerRetryOptions.add("--container_retry_interval " + + cliParser.getOptionValue("container_retry_interval")); + } + return true; } @@ -639,6 +671,8 @@ public boolean run() throws IOException, YarnException { vargs.add("--debug"); } + vargs.addAll(containerRetryOptions); + vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerLaunchContextPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerLaunchContextPBImpl.java index 30403caa97..1efe541710 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerLaunchContextPBImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerLaunchContextPBImpl.java @@ -29,10 +29,12 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationACLMapProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerLaunchContextProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerLaunchContextProtoOrBuilder; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryContextProto; import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto; import org.apache.hadoop.yarn.proto.YarnProtos.StringBytesMapProto; import org.apache.hadoop.yarn.proto.YarnProtos.StringLocalResourceMapProto; @@ -56,7 +58,8 @@ public class ContainerLaunchContextPBImpl private Map environment = null; private List commands = null; private Map applicationACLS = null; - + private ContainerRetryContext containerRetryContext = null; + public ContainerLaunchContextPBImpl() { builder = ContainerLaunchContextProto.newBuilder(); } @@ -120,6 +123,10 @@ private void mergeLocalToBuilder() { if (this.applicationACLS != null) { addApplicationACLs(); } + if (this.containerRetryContext != null) { + builder.setContainerRetryContext( + convertToProtoFormat(this.containerRetryContext)); + } } private void mergeLocalToProto() { @@ -462,6 +469,27 @@ public void 
setApplicationACLs( this.applicationACLS.putAll(appACLs); } + public ContainerRetryContext getContainerRetryContext() { + ContainerLaunchContextProtoOrBuilder p = viaProto ? proto : builder; + if (this.containerRetryContext != null) { + return this.containerRetryContext; + } + if (!p.hasContainerRetryContext()) { + return null; + } + this.containerRetryContext = convertFromProtoFormat( + p.getContainerRetryContext()); + return this.containerRetryContext; + } + + public void setContainerRetryContext(ContainerRetryContext retryContext) { + maybeInitBuilder(); + if (retryContext == null) { + builder.clearContainerRetryContext(); + } + this.containerRetryContext = retryContext; + } + private LocalResourcePBImpl convertFromProtoFormat(LocalResourceProto p) { return new LocalResourcePBImpl(p); } @@ -469,4 +497,14 @@ private LocalResourcePBImpl convertFromProtoFormat(LocalResourceProto p) { private LocalResourceProto convertToProtoFormat(LocalResource t) { return ((LocalResourcePBImpl)t).getProto(); } -} + + private ContainerRetryContextPBImpl convertFromProtoFormat( + ContainerRetryContextProto p) { + return new ContainerRetryContextPBImpl(p); + } + + private ContainerRetryContextProto convertToProtoFormat( + ContainerRetryContext t) { + return ((ContainerRetryContextPBImpl)t).getProto(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java new file mode 100644 index 0000000000..a5ef70de2f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java @@ -0,0 +1,177 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.api.records.impl.pb; + + +import com.google.protobuf.TextFormat; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryPolicyProto; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryContextProto; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryContextProtoOrBuilder; + +import java.util.HashSet; +import java.util.Set; + +/** + * Implementation of ContainerRetryContext. 
+ */ +public class ContainerRetryContextPBImpl extends ContainerRetryContext { + private ContainerRetryContextProto proto = + ContainerRetryContextProto.getDefaultInstance(); + private ContainerRetryContextProto.Builder builder = null; + private boolean viaProto = false; + + private Set errorCodes = null; + + public ContainerRetryContextPBImpl() { + builder = ContainerRetryContextProto.newBuilder(); + } + + public ContainerRetryContextPBImpl(ContainerRetryContextProto proto) { + this.proto = proto; + viaProto = true; + } + + public ContainerRetryContextProto getProto() { + mergeLocalToProto(); + proto = viaProto ? proto : builder.build(); + viaProto = true; + return proto; + } + + @Override + public int hashCode() { + return getProto().hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == null) { + return false; + } + if (other.getClass().isAssignableFrom(this.getClass())) { + return this.getProto().equals(this.getClass().cast(other).getProto()); + } + return false; + } + + @Override + public String toString() { + return TextFormat.shortDebugString(getProto()); + } + + private void mergeLocalToBuilder() { + if (this.errorCodes != null) { + builder.clearErrorCodes(); + builder.addAllErrorCodes(this.errorCodes); + } + } + + private void mergeLocalToProto() { + if (viaProto) { + maybeInitBuilder(); + } + mergeLocalToBuilder(); + proto = builder.build(); + viaProto = true; + } + + private void maybeInitBuilder() { + if (viaProto || builder == null) { + builder = ContainerRetryContextProto.newBuilder(proto); + } + viaProto = false; + } + + public ContainerRetryPolicy getRetryPolicy() { + ContainerRetryContextProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasRetryPolicy()) { + return ContainerRetryPolicy.NEVER_RETRY; + } + return convertFromProtoFormat(p.getRetryPolicy()); + } + + public void setRetryPolicy(ContainerRetryPolicy containerRetryPolicy) { + maybeInitBuilder(); + if (containerRetryPolicy == null) { + builder.clearRetryPolicy(); + return; + } + builder.setRetryPolicy(convertToProtoFormat(containerRetryPolicy)); + } + + private void initErrorCodes() { + if (this.errorCodes != null) { + return; + } + ContainerRetryContextProtoOrBuilder p = viaProto ? proto : builder; + this.errorCodes = new HashSet<>(); + this.errorCodes.addAll(p.getErrorCodesList()); + } + + public Set getErrorCodes() { + initErrorCodes(); + return this.errorCodes; + } + + public void setErrorCodes(Set errCodes) { + maybeInitBuilder(); + if (errCodes == null || errCodes.isEmpty()) { + builder.clearErrorCodes(); + } + this.errorCodes = errCodes; + } + + public int getMaxRetries() { + ContainerRetryContextProtoOrBuilder p = viaProto ? proto : builder; + if (!p.hasMaxRetries()) { + return 0; + } + return p.getMaxRetries(); + } + + public void setMaxRetries(int maxRetries) { + maybeInitBuilder(); + builder.setMaxRetries(maxRetries); + } + + public int getRetryInterval() { + ContainerRetryContextProtoOrBuilder p = viaProto ? 
proto : builder; + if (!p.hasRetryInterval()) { + return 0; + } + return p.getRetryInterval(); + } + + public void setRetryInterval(int retryInterval) { + maybeInitBuilder(); + builder.setRetryInterval(retryInterval); + } + + private ContainerRetryPolicyProto convertToProtoFormat( + ContainerRetryPolicy containerRetryPolicy) { + return ProtoUtils.convertToProtoFormat(containerRetryPolicy); + } + + private ContainerRetryPolicy convertFromProtoFormat( + ContainerRetryPolicyProto containerRetryPolicyProto) { + return ProtoUtils.convertFromProtoFormat(containerRetryPolicyProto); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java index 9d683f1924..236df90d92 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ProtoUtils.java @@ -26,6 +26,7 @@ import org.apache.hadoop.yarn.api.records.AMCommand; import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; +import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ExecutionType; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; @@ -56,6 +57,7 @@ import org.apache.hadoop.yarn.proto.YarnProtos.ReservationRequestInterpreterProto; import org.apache.hadoop.yarn.proto.YarnProtos.YarnApplicationAttemptStateProto; import org.apache.hadoop.yarn.proto.YarnProtos.YarnApplicationStateProto; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryPolicyProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerTypeProto; import org.apache.hadoop.yarn.proto.YarnProtos.ExecutionTypeProto; import org.apache.hadoop.yarn.proto.YarnServiceProtos; @@ -309,4 +311,17 @@ public static Resource convertFromProtoFormat( YarnProtos.ResourceProto resource) { return new ResourcePBImpl(resource); } + + /* + * ContainerRetryPolicy + */ + public static ContainerRetryPolicyProto convertToProtoFormat( + ContainerRetryPolicy e) { + return ContainerRetryPolicyProto.valueOf(e.name()); + } + + public static ContainerRetryPolicy convertFromProtoFormat( + ContainerRetryPolicyProto e) { + return ContainerRetryPolicy.valueOf(e.name()); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 1c5ee9535a..2be402ab6f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1581,6 +1581,13 @@ NONE + + Maximum size of contain's diagnostics to keep for relaunching + container case. 
+ yarn.nodemanager.container-diagnostics-maximum-size + 10000 + + Max number of threads in NMClientAsync to process container management events diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/api/TestPBImplRecords.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/api/TestPBImplRecords.java index 07b06fa8fe..14f61b736e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/api/TestPBImplRecords.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/api/TestPBImplRecords.java @@ -121,6 +121,7 @@ import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerReport; import org.apache.hadoop.yarn.api.records.ContainerResourceChangeRequest; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LogAggregationContext; @@ -165,6 +166,7 @@ import org.apache.hadoop.yarn.api.records.impl.pb.ContainerPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ContainerReportPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ContainerResourceChangeRequestPBImpl; +import org.apache.hadoop.yarn.api.records.impl.pb.ContainerRetryContextPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ContainerStatusPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.LocalResourcePBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.NMTokenPBImpl; @@ -199,6 +201,7 @@ import org.apache.hadoop.yarn.proto.YarnProtos.ContainerProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerReportProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerResourceChangeRequestProto; +import org.apache.hadoop.yarn.proto.YarnProtos.ContainerRetryContextProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerStatusProto; import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto; import org.apache.hadoop.yarn.proto.YarnProtos.NodeIdProto; @@ -354,7 +357,7 @@ private static Object genTypeValue(Type type) { return rand.nextBoolean(); } else if (type.equals(byte.class)) { return bytes[rand.nextInt(4)]; - } else if (type.equals(int.class)) { + } else if (type.equals(int.class) || type.equals(Integer.class)) { return rand.nextInt(1000000); } else if (type.equals(long.class)) { return Long.valueOf(rand.nextInt(1000000)); @@ -478,6 +481,7 @@ public static void setup() throws Exception { generateByNewInstance(ApplicationResourceUsageReport.class); generateByNewInstance(ApplicationReport.class); generateByNewInstance(Container.class); + generateByNewInstance(ContainerRetryContext.class); generateByNewInstance(ContainerLaunchContext.class); generateByNewInstance(ApplicationSubmissionContext.class); generateByNewInstance(ContainerReport.class); @@ -968,6 +972,12 @@ public void testContainerIdPBImpl() throws Exception { validatePBImplRecord(ContainerIdPBImpl.class, ContainerIdProto.class); } + @Test + public void testContainerRetryPBImpl() throws Exception { + validatePBImplRecord(ContainerRetryContextPBImpl.class, + ContainerRetryContextProto.class); + } + @Test public void testContainerLaunchContextPBImpl() throws Exception { validatePBImplRecord(ContainerLaunchContextPBImpl.class, diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 8c74bf5831..d08ee67311 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -308,6 +308,7 @@ public void writeLaunchEnv(OutputStream out, } public enum ExitCode { + SUCCESS(0), FORCE_KILLED(137), TERMINATED(143), LOST(154); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 5cc4e19287..f8cb4eee70 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -408,6 +408,24 @@ public long getLastDisksCheckTime() { return lastDisksCheckTime; } + public boolean isGoodLocalDir(String path) { + return isInGoodDirs(getLocalDirs(), path); + } + + public boolean isGoodLogDir(String path) { + return isInGoodDirs(getLogDirs(), path); + } + + private boolean isInGoodDirs(List goodDirs, String path) { + for (String goodDir : goodDirs) { + if (path.startsWith(goodDir)) { + return true; + } + } + + return false; + } + /** * Set good local dirs and good log dirs in the configuration so that the * LocalDirAllocator objects will use this updated configuration only. 
@@ -551,6 +569,10 @@ public Path getLocalPathForWrite(String pathStr, long size, checkWrite); } + public Path getLocalPathForRead(String pathStr) throws IOException { + return getPathToRead(pathStr, getLocalDirsForRead()); + } + public Path getLogPathForWrite(String pathStr, boolean checkWrite) throws IOException { return logDirsAllocator.getLocalPathForWrite(pathStr, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 29ab7f997a..162823c9cd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -354,8 +354,7 @@ private void recoverContainer(RecoveredContainerState rcs) YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), - credentials, metrics, token, rcs.getStatus(), rcs.getExitCode(), - rcs.getDiagnostics(), rcs.getKilled(), rcs.getCapability(), context); + credentials, metrics, token, context, rcs); context.getContainers().put(containerId, container); dispatcher.getEventHandler().handle( new ApplicationContainerInitEvent(container)); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java index 1d2ec5687b..7571964d93 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java @@ -55,6 +55,18 @@ public interface Container extends EventHandler { NMContainerStatus getNMContainerStatus(); + boolean isRetryContextSet(); + + boolean shouldRetry(int errorCode); + + String getWorkDir(); + + void setWorkDir(String workDir); + + String getLogDir(); + + void setLogDir(String logDir); + String toString(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 676721435c..b1ddc2ef95 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -41,6 +41,8 @@ import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; @@ -50,6 +52,7 @@ import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; +import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent; @@ -71,6 +74,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerStatus; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.state.InvalidStateTransitionException; @@ -98,11 +102,17 @@ public class ContainerImpl implements Container { private final String user; private int exitCode = ContainerExitStatus.INVALID; private final StringBuilder diagnostics; + private final int diagnosticsMaxSize; private boolean wasLaunched; private long containerLocalizationStartTime; private long containerLaunchStartTime; private ContainerMetrics containerMetrics; private static Clock clock = SystemClock.getInstance(); + private final ContainerRetryContext containerRetryContext; + // remaining retries to relaunch container if needed + private int remainingRetryAttempts; + private String workDir; + private String logDir; /** The NM-wide configuration - not specific to this container */ private final Configuration daemonConf; @@ -138,6 +148,16 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, this.dispatcher = dispatcher; this.stateStore = context.getNMStateStore(); this.launchContext = launchContext; + if (launchContext != null + && launchContext.getContainerRetryContext() != null) { + this.containerRetryContext = launchContext.getContainerRetryContext(); + } else { + this.containerRetryContext = ContainerRetryContext.NEVER_RETRY_CONTEXT; + } + this.remainingRetryAttempts = containerRetryContext.getMaxRetries(); + this.diagnosticsMaxSize = conf.getInt( + YarnConfiguration.NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE, + YarnConfiguration.DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE); this.containerTokenIdentifier = containerTokenIdentifier; this.containerId = containerTokenIdentifier.getContainerID(); this.resource = containerTokenIdentifier.getResource(); @@ -172,22 +192,24 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, public ContainerImpl(Configuration conf, Dispatcher dispatcher, 
ContainerLaunchContext launchContext, Credentials creds, NodeManagerMetrics metrics, - ContainerTokenIdentifier containerTokenIdentifier, - RecoveredContainerStatus recoveredStatus, int exitCode, - String diagnostics, boolean wasKilled, Resource recoveredCapability, - Context context) { + ContainerTokenIdentifier containerTokenIdentifier, Context context, + RecoveredContainerState rcs) { this(conf, dispatcher, launchContext, creds, metrics, containerTokenIdentifier, context); - this.recoveredStatus = recoveredStatus; - this.exitCode = exitCode; - this.recoveredAsKilled = wasKilled; - this.diagnostics.append(diagnostics); + this.recoveredStatus = rcs.getStatus(); + this.exitCode = rcs.getExitCode(); + this.recoveredAsKilled = rcs.getKilled(); + this.diagnostics.append(rcs.getDiagnostics()); + Resource recoveredCapability = rcs.getCapability(); if (recoveredCapability != null && !this.resource.equals(recoveredCapability)) { // resource capability had been updated before NM was down this.resource = Resource.newInstance(recoveredCapability.getMemory(), recoveredCapability.getVirtualCores()); } + this.remainingRetryAttempts = rcs.getRemainingRetryAttempts(); + this.workDir = rcs.getWorkDir(); + this.logDir = rcs.getLogDir(); } private static final ContainerDiagnosticsUpdateTransition UPDATE_DIAGNOSTICS_TRANSITION = @@ -267,9 +289,10 @@ ContainerEventType.KILL_CONTAINER, new KillTransition()) ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS, new ExitedWithSuccessTransition(true)) .addTransition(ContainerState.RUNNING, - ContainerState.EXITED_WITH_FAILURE, + EnumSet.of(ContainerState.RELAUNCHING, + ContainerState.EXITED_WITH_FAILURE), ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, - new ExitedWithFailureTransition(true)) + new RetryFailureTransition()) .addTransition(ContainerState.RUNNING, ContainerState.RUNNING, ContainerEventType.UPDATE_DIAGNOSTICS_MSG, UPDATE_DIAGNOSTICS_TRANSITION) @@ -279,6 +302,19 @@ ContainerEventType.KILL_CONTAINER, new KillTransition()) ContainerEventType.CONTAINER_KILLED_ON_REQUEST, new KilledExternallyTransition()) + // From RELAUNCHING State + .addTransition(ContainerState.RELAUNCHING, ContainerState.RUNNING, + ContainerEventType.CONTAINER_LAUNCHED, new LaunchTransition()) + .addTransition(ContainerState.RELAUNCHING, + ContainerState.EXITED_WITH_FAILURE, + ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, + new ExitedWithFailureTransition(true)) + .addTransition(ContainerState.RELAUNCHING, ContainerState.RELAUNCHING, + ContainerEventType.UPDATE_DIAGNOSTICS_MSG, + UPDATE_DIAGNOSTICS_TRANSITION) + .addTransition(ContainerState.RELAUNCHING, ContainerState.KILLING, + ContainerEventType.KILL_CONTAINER, new KillTransition()) + // From CONTAINER_EXITED_WITH_SUCCESS State .addTransition(ContainerState.EXITED_WITH_SUCCESS, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, @@ -382,6 +418,7 @@ public org.apache.hadoop.yarn.api.records.ContainerState getCurrentState() { case LOCALIZATION_FAILED: case LOCALIZED: case RUNNING: + case RELAUNCHING: case EXITED_WITH_SUCCESS: case EXITED_WITH_FAILURE: case KILLING: @@ -408,7 +445,8 @@ public String getUser() { public Map> getLocalizedResources() { this.readLock.lock(); try { - if (ContainerState.LOCALIZED == getContainerState()) { + if (ContainerState.LOCALIZED == getContainerState() + || ContainerState.RELAUNCHING == getContainerState()) { return localizedResources; } else { return null; @@ -501,6 +539,26 @@ public ContainerTokenIdentifier getContainerTokenIdentifier() { } } + @Override + public String 
getWorkDir() { + return workDir; + } + + @Override + public void setWorkDir(String workDir) { + this.workDir = workDir; + } + + @Override + public String getLogDir() { + return logDir; + } + + @Override + public void setLogDir(String logDir) { + this.logDir = logDir; + } + @SuppressWarnings("unchecked") private void sendFinishedEvents() { // Inform the application @@ -527,6 +585,14 @@ private void sendLaunchEvent() { new ContainersLauncherEvent(this, launcherEvent)); } + @SuppressWarnings("unchecked") // dispatcher not typed + private void sendRelaunchEvent() { + ContainersLauncherEventType launcherEvent = + ContainersLauncherEventType.RELAUNCH_CONTAINER; + dispatcher.getEventHandler().handle( + new ContainersLauncherEvent(this, launcherEvent)); + } + // Inform the ContainersMonitor to start monitoring the container's // resource usage. @SuppressWarnings("unchecked") // dispatcher not typed @@ -552,6 +618,9 @@ private void addDiagnostics(String... diags) { for (String s : diags) { this.diagnostics.append(s); } + if (isRetryContextSet() && diagnostics.length() > diagnosticsMaxSize) { + diagnostics.delete(0, diagnostics.length() - diagnosticsMaxSize); + } try { stateStore.storeContainerDiagnostics(containerId, diagnostics); } catch (IOException e) { @@ -876,6 +945,100 @@ public void transition(ContainerImpl container, ContainerEvent event) { } } + /** + * Transition to EXITED_WITH_FAILURE or LOCALIZED state upon + * CONTAINER_EXITED_WITH_FAILURE state. + **/ + @SuppressWarnings("unchecked") // dispatcher not typed + static class RetryFailureTransition implements + MultipleArcTransition { + + @Override + public ContainerState transition(final ContainerImpl container, + ContainerEvent event) { + ContainerExitEvent exitEvent = (ContainerExitEvent) event; + container.exitCode = exitEvent.getExitCode(); + if (exitEvent.getDiagnosticInfo() != null) { + if (container.containerRetryContext.getRetryPolicy() + != ContainerRetryPolicy.NEVER_RETRY) { + int n = container.containerRetryContext.getMaxRetries() + - container.remainingRetryAttempts; + container.addDiagnostics("Diagnostic message from attempt " + + n + " : ", "\n"); + } + container.addDiagnostics(exitEvent.getDiagnosticInfo(), "\n"); + } + + if (container.shouldRetry(container.exitCode)) { + if (container.remainingRetryAttempts > 0) { + container.remainingRetryAttempts--; + try { + container.stateStore.storeContainerRemainingRetryAttempts( + container.getContainerId(), container.remainingRetryAttempts); + } catch (IOException e) { + LOG.warn( + "Unable to update remainingRetryAttempts in state store for " + + container.getContainerId(), e); + } + } + LOG.info("Relaunching Container " + container.getContainerId() + + ". Remaining retry attempts(after relaunch) : " + + container.remainingRetryAttempts + + ". 
Interval between retries is " + + container.containerRetryContext.getRetryInterval() + "ms"); + container.wasLaunched = false; + container.metrics.endRunningContainer(); + if (container.containerRetryContext.getRetryInterval() == 0) { + container.sendRelaunchEvent(); + } else { + // wait for some time, then send launch event + new Thread() { + @Override + public void run() { + try { + Thread.sleep( + container.containerRetryContext.getRetryInterval()); + container.sendRelaunchEvent(); + } catch (InterruptedException e) { + return; + } + } + }.start(); + } + return ContainerState.RELAUNCHING; + } else { + new ExitedWithFailureTransition(true).transition(container, event); + return ContainerState.EXITED_WITH_FAILURE; + } + } + } + + @Override + public boolean isRetryContextSet() { + return containerRetryContext.getRetryPolicy() + != ContainerRetryPolicy.NEVER_RETRY; + } + + @Override + public boolean shouldRetry(int errorCode) { + if (errorCode == ExitCode.SUCCESS.getExitCode() + || errorCode == ExitCode.FORCE_KILLED.getExitCode() + || errorCode == ExitCode.TERMINATED.getExitCode()) { + return false; + } + + ContainerRetryPolicy retryPolicy = containerRetryContext.getRetryPolicy(); + if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS + || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES + && containerRetryContext.getErrorCodes() != null + && containerRetryContext.getErrorCodes().contains(errorCode))) { + return remainingRetryAttempts > 0 + || remainingRetryAttempts == ContainerRetryContext.RETRY_FOREVER; + } + + return false; + } + /** * Transition to EXITED_WITH_FAILURE upon receiving KILLED_ON_REQUEST */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerState.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerState.java index a43df89287..6b96204a9e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerState.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerState.java @@ -19,7 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.container; public enum ContainerState { - NEW, LOCALIZING, LOCALIZATION_FAILED, LOCALIZED, RUNNING, EXITED_WITH_SUCCESS, - EXITED_WITH_FAILURE, KILLING, CONTAINER_CLEANEDUP_AFTER_KILL, - CONTAINER_RESOURCES_CLEANINGUP, DONE + NEW, LOCALIZING, LOCALIZATION_FAILED, LOCALIZED, RUNNING, RELAUNCHING, + EXITED_WITH_SUCCESS, EXITED_WITH_FAILURE, KILLING, + CONTAINER_CLEANEDUP_AFTER_KILL, CONTAINER_RESOURCES_CLEANINGUP, DONE } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index 76ee90e576..a3b53e35e4 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -58,6 +58,7 @@ import org.apache.hadoop.yarn.api.records.SignalContainerCommand; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.DelayedProcessKiller; @@ -98,7 +99,7 @@ public class ContainerLaunch implements Callable { protected final Dispatcher dispatcher; protected final ContainerExecutor exec; - private final Application app; + protected final Application app; protected final Container container; private final Configuration conf; private final Context context; @@ -112,7 +113,7 @@ public class ContainerLaunch implements Callable { protected Path pidFilePath = null; - private final LocalDirsHandlerService dirsHandler; + protected final LocalDirsHandlerService dirsHandler; public ContainerLaunch(Context context, Configuration configuration, Dispatcher dispatcher, ContainerExecutor exec, Application app, @@ -156,33 +157,19 @@ public static String expandEnvironment(String var, @Override @SuppressWarnings("unchecked") // dispatcher not typed public Integer call() { + if (!validateContainerState()) { + return 0; + } + final ContainerLaunchContext launchContext = container.getLaunchContext(); - Map> localResources = null; ContainerId containerID = container.getContainerId(); String containerIdStr = ConverterUtils.toString(containerID); final List command = launchContext.getCommands(); int ret = -1; - // CONTAINER_KILLED_ON_REQUEST should not be missed if the container - // is already at KILLING - if (container.getContainerState() == ContainerState.KILLING) { - dispatcher.getEventHandler().handle( - new ContainerExitEvent(containerID, - ContainerEventType.CONTAINER_KILLED_ON_REQUEST, - Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() : - ExitCode.TERMINATED.getExitCode(), - "Container terminated before launch.")); - return 0; - } - Path containerLogDir; try { - localResources = container.getLocalizedResources(); - if (localResources == null) { - throw RPCUtil.getRemoteException( - "Unable to get local resources when Container " + containerID + - " is at " + container.getContainerState()); - } + Map> localResources = getLocalizedResources(); final String user = container.getUser(); // /////////////////////////// Variable expansion @@ -193,6 +180,7 @@ public Integer call() { .getRelativeContainerLogDir(appIdStr, containerIdStr); containerLogDir = dirsHandler.getLogPathForWrite(relativeContainerLogDir, false); + recordContainerLogDir(containerID, containerLogDir.toString()); for (String str : command) { // TODO: Should we instead work via symlinks without this grammar? 
newCmds.add(expandEnvironment(str, containerLogDir)); @@ -233,6 +221,7 @@ public Integer call() { + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr + Path.SEPARATOR + containerIdStr, LocalDirAllocator.SIZE_UNKNOWN, false); + recordContainerWorkDir(containerID, containerWorkDir.toString()); String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr); @@ -241,11 +230,8 @@ public Integer call() { pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath); List localDirs = dirsHandler.getLocalDirs(); List logDirs = dirsHandler.getLogDirs(); - - List containerLogDirs = new ArrayList(); - for( String logDir : logDirs) { - containerLogDirs.add(logDir + Path.SEPARATOR + relativeContainerLogDir); - } + List containerLocalDirs = getContainerLocalDirs(localDirs); + List containerLogDirs = getContainerLogDirs(logDirs); if (!dirsHandler.areDisksHealthy()) { ret = ContainerExitStatus.DISKS_FAILED; @@ -253,7 +239,6 @@ public Integer call() { + dirsHandler.getDisksHealthReport(false)); } - List containerLocalDirs = new ArrayList<>(localDirs.size()); try { // /////////// Write out the container-script in the nmPrivate space. List appDirs = new ArrayList(localDirs.size()); @@ -262,14 +247,6 @@ public Integer call() { Path userdir = new Path(usersdir, user); Path appsdir = new Path(userdir, ContainerLocalizer.APPCACHE); appDirs.add(new Path(appsdir, appIdStr)); - - String containerLocalDir = localDir + Path.SEPARATOR + - ContainerLocalizer.USERCACHE + Path.SEPARATOR + user - + Path.SEPARATOR - + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr - + Path.SEPARATOR; - - containerLocalDirs.add(containerLocalDir); } containerScriptOutStream = lfs.create(nmPrivateContainerScriptPath, @@ -301,35 +278,19 @@ public Integer call() { IOUtils.cleanup(LOG, containerScriptOutStream, tokensOutStream); } - // LaunchContainer is a blocking call. We are here almost means the - // container is launched, so send out the event. - dispatcher.getEventHandler().handle(new ContainerEvent( - containerID, - ContainerEventType.CONTAINER_LAUNCHED)); - context.getNMStateStore().storeContainerLaunched(containerID); - - // Check if the container is signalled to be killed. 
- if (!shouldLaunchContainer.compareAndSet(false, true)) { - LOG.info("Container " + containerIdStr + " not launched as " - + "cleanup already called"); - ret = ExitCode.TERMINATED.getExitCode(); - } - else { - exec.activateContainer(containerID, pidFilePath); - ret = exec.launchContainer(new ContainerStartContext.Builder() - .setContainer(container) - .setLocalizedResources(localResources) - .setNmPrivateContainerScriptPath(nmPrivateContainerScriptPath) - .setNmPrivateTokensPath(nmPrivateTokensPath) - .setUser(user) - .setAppId(appIdStr) - .setContainerWorkDir(containerWorkDir) - .setLocalDirs(localDirs) - .setLogDirs(logDirs) - .setContainerLocalDirs(containerLocalDirs) - .setContainerLogDirs(containerLogDirs) - .build()); - } + ret = launchContainer(new ContainerStartContext.Builder() + .setContainer(container) + .setLocalizedResources(localResources) + .setNmPrivateContainerScriptPath(nmPrivateContainerScriptPath) + .setNmPrivateTokensPath(nmPrivateTokensPath) + .setUser(user) + .setAppId(appIdStr) + .setContainerWorkDir(containerWorkDir) + .setLocalDirs(localDirs) + .setLogDirs(logDirs) + .setContainerLocalDirs(containerLocalDirs) + .setContainerLogDirs(containerLogDirs) + .build()); } catch (Throwable e) { LOG.warn("Failed to launch container.", e); dispatcher.getEventHandler().handle(new ContainerExitEvent( @@ -337,46 +298,138 @@ public Integer call() { e.getMessage())); return ret; } finally { - completed.set(true); - exec.deactivateContainer(containerID); - try { - context.getNMStateStore().storeContainerCompleted(containerID, ret); - } catch (IOException e) { - LOG.error("Unable to set exit code for container " + containerID); - } + setContainerCompletedStatus(ret); } + handleContainerExitCode(ret, containerLogDir); + + return ret; + } + + @SuppressWarnings("unchecked") + protected boolean validateContainerState() { + // CONTAINER_KILLED_ON_REQUEST should not be missed if the container + // is already at KILLING + if (container.getContainerState() == ContainerState.KILLING) { + dispatcher.getEventHandler().handle( + new ContainerExitEvent(container.getContainerId(), + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, + Shell.WINDOWS ? 
ExitCode.FORCE_KILLED.getExitCode() : + ExitCode.TERMINATED.getExitCode(), + "Container terminated before launch.")); + return false; + } + + return true; + } + + protected List getContainerLogDirs(List logDirs) { + List containerLogDirs = new ArrayList<>(logDirs.size()); + String appIdStr = app.getAppId().toString(); + String containerIdStr = ConverterUtils.toString(container.getContainerId()); + String relativeContainerLogDir = ContainerLaunch + .getRelativeContainerLogDir(appIdStr, containerIdStr); + + for(String logDir : logDirs) { + containerLogDirs.add(logDir + Path.SEPARATOR + relativeContainerLogDir); + } + + return containerLogDirs; + } + + protected List getContainerLocalDirs(List localDirs) { + List containerLocalDirs = new ArrayList<>(localDirs.size()); + String user = container.getUser(); + String appIdStr = app.getAppId().toString(); + String relativeContainerLocalDir = ContainerLocalizer.USERCACHE + + Path.SEPARATOR + user + Path.SEPARATOR + ContainerLocalizer.APPCACHE + + Path.SEPARATOR + appIdStr + Path.SEPARATOR; + + for (String localDir : localDirs) { + containerLocalDirs.add(localDir + Path.SEPARATOR + + relativeContainerLocalDir); + } + + return containerLocalDirs; + } + + protected Map> getLocalizedResources() + throws YarnException { + Map> localResources = container.getLocalizedResources(); + if (localResources == null) { + throw RPCUtil.getRemoteException( + "Unable to get local resources when Container " + container + + " is at " + container.getContainerState()); + } + return localResources; + } + + @SuppressWarnings("unchecked") + protected int launchContainer(ContainerStartContext ctx) throws IOException { + ContainerId containerId = container.getContainerId(); + + // LaunchContainer is a blocking call. We are here almost means the + // container is launched, so send out the event. + dispatcher.getEventHandler().handle(new ContainerEvent( + containerId, + ContainerEventType.CONTAINER_LAUNCHED)); + context.getNMStateStore().storeContainerLaunched(containerId); + + // Check if the container is signalled to be killed. + if (!shouldLaunchContainer.compareAndSet(false, true)) { + LOG.info("Container " + containerId + " not launched as " + + "cleanup already called"); + return ExitCode.TERMINATED.getExitCode(); + } else { + exec.activateContainer(containerId, pidFilePath); + return exec.launchContainer(ctx); + } + } + + protected void setContainerCompletedStatus(int exitCode) { + ContainerId containerId = container.getContainerId(); + completed.set(true); + exec.deactivateContainer(containerId); + try { + if (!container.shouldRetry(exitCode)) { + context.getNMStateStore().storeContainerCompleted(containerId, + exitCode); + } + } catch (IOException e) { + LOG.error("Unable to set exit code for container " + containerId); + } + } + + @SuppressWarnings("unchecked") + protected void handleContainerExitCode(int exitCode, Path containerLogDir) { + ContainerId containerId = container.getContainerId(); + if (LOG.isDebugEnabled()) { - LOG.debug("Container " + containerIdStr + " completed with exit code " - + ret); + LOG.debug("Container " + containerId + " completed with exit code " + + exitCode); } StringBuilder diagnosticInfo = new StringBuilder("Container exited with a non-zero exit code "); - diagnosticInfo.append(ret); + diagnosticInfo.append(exitCode); diagnosticInfo.append(". 
"); - if (ret == ExitCode.FORCE_KILLED.getExitCode() - || ret == ExitCode.TERMINATED.getExitCode()) { + if (exitCode == ExitCode.FORCE_KILLED.getExitCode() + || exitCode == ExitCode.TERMINATED.getExitCode()) { // If the process was killed, Send container_cleanedup_after_kill and // just break out of this method. dispatcher.getEventHandler().handle( - new ContainerExitEvent(containerID, - ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret, - diagnosticInfo.toString())); - return ret; - } - - if (ret != 0) { - handleContainerExitWithFailure(containerID, ret, containerLogDir, + new ContainerExitEvent(containerId, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode, + diagnosticInfo.toString())); + } else if (exitCode != 0) { + handleContainerExitWithFailure(containerId, exitCode, containerLogDir, diagnosticInfo); - return ret; + } else { + LOG.info("Container " + containerId + " succeeded "); + dispatcher.getEventHandler().handle( + new ContainerEvent(containerId, + ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS)); } - - LOG.info("Container " + containerIdStr + " succeeded "); - dispatcher.getEventHandler().handle( - new ContainerEvent(containerID, - ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS)); - return 0; } /** @@ -389,8 +442,8 @@ public Integer call() { * @param diagnosticInfo */ @SuppressWarnings("unchecked") - private void handleContainerExitWithFailure(ContainerId containerID, int ret, - Path containerLogDir, StringBuilder diagnosticInfo) { + protected void handleContainerExitWithFailure(ContainerId containerID, + int ret, Path containerLogDir, StringBuilder diagnosticInfo) { LOG.warn(diagnosticInfo); String errorFileNamePattern = @@ -689,7 +742,8 @@ public static String getRelativeContainerLogDir(String appIdStr, return appIdStr + Path.SEPARATOR + containerIdStr; } - private String getContainerPrivateDir(String appIdStr, String containerIdStr) { + protected String getContainerPrivateDir(String appIdStr, + String containerIdStr) { return getAppPrivateDir(appIdStr) + Path.SEPARATOR + containerIdStr + Path.SEPARATOR; } @@ -1106,4 +1160,20 @@ public void sanitizeEnv(Map environment, Path pwd, public static String getExitCodeFile(String pidFile) { return pidFile + EXIT_CODE_FILE_SUFFIX; } + + private void recordContainerLogDir(ContainerId containerId, + String logDir) throws IOException{ + if (container.isRetryContextSet()) { + container.setLogDir(logDir); + context.getNMStateStore().storeContainerLogDir(containerId, logDir); + } + } + + private void recordContainerWorkDir(ContainerId containerId, + String workDir) throws IOException{ + if (container.isRetryContextSet()) { + container.setWorkDir(workDir); + context.getNMStateStore().storeContainerWorkDir(containerId, workDir); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java new file mode 100644 index 0000000000..711d5cdc26 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerRelaunch.java @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; +import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext; +import org.apache.hadoop.yarn.server.nodemanager.executor.DeletionAsUserContext; +import org.apache.hadoop.yarn.util.ConverterUtils; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * Relaunch container. 
+ */ +public class ContainerRelaunch extends ContainerLaunch { + + private static final Log LOG = LogFactory.getLog(ContainerRelaunch.class); + + public ContainerRelaunch(Context context, Configuration configuration, + Dispatcher dispatcher, ContainerExecutor exec, Application app, + Container container, LocalDirsHandlerService dirsHandler, + ContainerManagerImpl containerManager) { + super(context, configuration, dispatcher, exec, app, container, dirsHandler, + containerManager); + } + + @Override + @SuppressWarnings("unchecked") + public Integer call() { + if (!validateContainerState()) { + return 0; + } + + ContainerId containerId = container.getContainerId(); + String containerIdStr = ConverterUtils.toString(containerId); + int ret = -1; + Path containerLogDir; + try { + Path containerWorkDir = getContainerWorkDir(); + cleanupPreviousContainerFiles(containerWorkDir); + + containerLogDir = getContainerLogDir(); + + Map> localResources = getLocalizedResources(); + + String appIdStr = app.getAppId().toString(); + Path nmPrivateContainerScriptPath = + getNmPrivateContainerScriptPath(appIdStr, containerIdStr); + Path nmPrivateTokensPath = + getNmPrivateTokensPath(appIdStr, containerIdStr); + pidFilePath = getPidFilePath(appIdStr, containerIdStr); + + LOG.info("Relaunch container with " + + "workDir = " + containerWorkDir.toString() + + ", logDir = " + containerLogDir.toString() + + ", nmPrivateContainerScriptPath = " + + nmPrivateContainerScriptPath.toString() + + ", nmPrivateTokensPath = " + nmPrivateTokensPath.toString() + + ", pidFilePath = " + pidFilePath.toString()); + + List localDirs = dirsHandler.getLocalDirs(); + List logDirs = dirsHandler.getLogDirs(); + List containerLocalDirs = getContainerLocalDirs(localDirs); + List containerLogDirs = getContainerLogDirs(logDirs); + + if (!dirsHandler.areDisksHealthy()) { + ret = ContainerExitStatus.DISKS_FAILED; + throw new IOException("Most of the disks failed. 
" + + dirsHandler.getDisksHealthReport(false)); + } + + ret = launchContainer(new ContainerStartContext.Builder() + .setContainer(container) + .setLocalizedResources(localResources) + .setNmPrivateContainerScriptPath(nmPrivateContainerScriptPath) + .setNmPrivateTokensPath(nmPrivateTokensPath) + .setUser(container.getUser()) + .setAppId(appIdStr) + .setContainerWorkDir(containerWorkDir) + .setLocalDirs(localDirs) + .setLogDirs(logDirs) + .setContainerLocalDirs(containerLocalDirs) + .setContainerLogDirs(containerLogDirs) + .build()); + } catch (Throwable e) { + LOG.warn("Failed to relaunch container.", e); + dispatcher.getEventHandler().handle(new ContainerExitEvent( + containerId, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret, + e.getMessage())); + return ret; + } finally { + setContainerCompletedStatus(ret); + } + + handleContainerExitCode(ret, containerLogDir); + + return ret; + } + + private Path getContainerWorkDir() throws IOException { + String containerWorkDir = container.getWorkDir(); + if (containerWorkDir == null + || !dirsHandler.isGoodLocalDir(containerWorkDir)) { + throw new IOException( + "Could not find a good work dir " + containerWorkDir + + " for container " + container); + } + + return new Path(containerWorkDir); + } + + private Path getContainerLogDir() throws IOException { + String containerLogDir = container.getLogDir(); + if (containerLogDir == null || !dirsHandler.isGoodLogDir(containerLogDir)) { + throw new IOException("Could not find a good log dir " + containerLogDir + + " for container " + container); + } + + return new Path(containerLogDir); + } + + private Path getNmPrivateContainerScriptPath(String appIdStr, + String containerIdStr) throws IOException { + return dirsHandler.getLocalPathForRead( + getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + + CONTAINER_SCRIPT); + } + + private Path getNmPrivateTokensPath(String appIdStr, + String containerIdStr) throws IOException { + return dirsHandler.getLocalPathForRead( + getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + + String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT, + containerIdStr)); + } + + private Path getPidFilePath(String appIdStr, + String containerIdStr) throws IOException { + return dirsHandler.getLocalPathForRead( + getPidFileSubpath(appIdStr, containerIdStr)); + } + + /** + * Clean up container's previous files for container relaunch. 
+ */ + private void cleanupPreviousContainerFiles(Path containerWorkDir) { + // delete ContainerScriptPath + deleteAsUser(new Path(containerWorkDir, CONTAINER_SCRIPT)); + // delete TokensPath + deleteAsUser(new Path(containerWorkDir, FINAL_CONTAINER_TOKENS_FILE)); + } + + private void deleteAsUser(Path path) { + try { + exec.deleteAsUser(new DeletionAsUserContext.Builder() + .setUser(container.getUser()) + .setSubDir(path) + .build()); + } catch (Exception e) { + LOG.warn("Failed to delete " + path, e); + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java index a34051ce44..e5fff00fbe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java @@ -118,6 +118,16 @@ public void handle(ContainersLauncherEvent event) { containerLauncher.submit(launch); running.put(containerId, launch); break; + case RELAUNCH_CONTAINER: + app = context.getApplications().get( + containerId.getApplicationAttemptId().getApplicationId()); + + ContainerRelaunch relaunch = + new ContainerRelaunch(context, getConfig(), dispatcher, exec, app, + event.getContainer(), dirsHandler, containerManager); + containerLauncher.submit(relaunch); + running.put(containerId, relaunch); + break; case RECOVER_CONTAINER: app = context.getApplications().get( containerId.getApplicationAttemptId().getApplicationId()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java index a88564db85..2d7bc74302 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java @@ -20,6 +20,7 @@ public enum ContainersLauncherEventType { LAUNCH_CONTAINER, + RELAUNCH_CONTAINER, RECOVER_CONTAINER, CLEANUP_CONTAINER, // The process(grp) itself. 
SIGNAL_CONTAINER, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index d74c6a8a6c..6e9efe123c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -110,6 +110,10 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { "/resourceChanged"; private static final String CONTAINER_KILLED_KEY_SUFFIX = "/killed"; private static final String CONTAINER_EXIT_CODE_KEY_SUFFIX = "/exitcode"; + private static final String CONTAINER_REMAIN_RETRIES_KEY_SUFFIX = + "/remainingRetryAttempts"; + private static final String CONTAINER_WORK_DIR_KEY_SUFFIX = "/workdir"; + private static final String CONTAINER_LOG_DIR_KEY_SUFFIX = "/logdir"; private static final String CURRENT_MASTER_KEY_SUFFIX = "CurrentMasterKey"; private static final String PREV_MASTER_KEY_SUFFIX = "PreviousMasterKey"; @@ -247,6 +251,13 @@ private RecoveredContainerState loadContainerState(ContainerId containerId, } else if (suffix.equals(CONTAINER_RESOURCE_CHANGED_KEY_SUFFIX)) { rcs.capability = new ResourcePBImpl( ResourceProto.parseFrom(entry.getValue())); + } else if (suffix.equals(CONTAINER_REMAIN_RETRIES_KEY_SUFFIX)) { + rcs.setRemainingRetryAttempts( + Integer.parseInt(asString(entry.getValue()))); + } else if (suffix.equals(CONTAINER_WORK_DIR_KEY_SUFFIX)) { + rcs.setWorkDir(asString(entry.getValue())); + } else if (suffix.equals(CONTAINER_LOG_DIR_KEY_SUFFIX)) { + rcs.setLogDir(asString(entry.getValue())); } else { throw new IOException("Unexpected container state key: " + key); } @@ -356,6 +367,42 @@ public void storeContainerCompleted(ContainerId containerId, } } + @Override + public void storeContainerRemainingRetryAttempts(ContainerId containerId, + int remainingRetryAttempts) throws IOException { + String key = CONTAINERS_KEY_PREFIX + containerId.toString() + + CONTAINER_REMAIN_RETRIES_KEY_SUFFIX; + try { + db.put(bytes(key), bytes(Integer.toString(remainingRetryAttempts))); + } catch (DBException e) { + throw new IOException(e); + } + } + + @Override + public void storeContainerWorkDir(ContainerId containerId, + String workDir) throws IOException { + String key = CONTAINERS_KEY_PREFIX + containerId.toString() + + CONTAINER_WORK_DIR_KEY_SUFFIX; + try { + db.put(bytes(key), bytes(workDir)); + } catch (DBException e) { + throw new IOException(e); + } + } + + @Override + public void storeContainerLogDir(ContainerId containerId, + String logDir) throws IOException { + String key = CONTAINERS_KEY_PREFIX + containerId.toString() + + CONTAINER_LOG_DIR_KEY_SUFFIX; + try { + db.put(bytes(key), bytes(logDir)); + } catch (DBException e) { + throw new IOException(e); + } + } + @Override public void removeContainer(ContainerId containerId) throws IOException { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java index a887e71e9e..08b80e961a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java @@ -99,6 +99,21 @@ public void storeContainerCompleted(ContainerId containerId, int exitCode) throws IOException { } + @Override + public void storeContainerRemainingRetryAttempts(ContainerId containerId, + int remainingRetryAttempts) throws IOException { + } + + @Override + public void storeContainerWorkDir(ContainerId containerId, + String workDir) throws IOException { + } + + @Override + public void storeContainerLogDir(ContainerId containerId, + String logDir) throws IOException { + } + @Override public void removeContainer(ContainerId containerId) throws IOException { } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java index 463815ec9c..ccf1e709d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto; import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto; @@ -72,6 +73,9 @@ public static class RecoveredContainerState { String diagnostics = ""; StartContainerRequest startRequest; Resource capability; + private int remainingRetryAttempts = ContainerRetryContext.RETRY_INVALID; + private String workDir; + private String logDir; public RecoveredContainerStatus getStatus() { return status; @@ -97,6 +101,30 @@ public Resource getCapability() { return capability; } + public int getRemainingRetryAttempts() { + return remainingRetryAttempts; + } + + public void setRemainingRetryAttempts(int retryAttempts) { + this.remainingRetryAttempts = retryAttempts; + } + + public String getWorkDir() { + return workDir; + } + + public void setWorkDir(String workDir) { + this.workDir = workDir; + } + + public String getLogDir() { + return logDir; + } + + public void setLogDir(String logDir) { + this.logDir = logDir; + } + @Override public String toString() { return new StringBuffer("Status: ").append(getStatus()) @@ -105,6 +133,9 @@ public String toString() { .append(", Diagnostics: ").append(getDiagnostics()) .append(", Capability: ").append(getCapability()) .append(", StartRequest: ").append(getStartRequest()) + 
.append(", RemainingRetryAttempts: ").append(remainingRetryAttempts) + .append(", WorkDir: ").append(workDir) + .append(", LogDir: ").append(logDir) .toString(); } } @@ -323,6 +354,34 @@ public abstract void storeContainerKilled(ContainerId containerId) public abstract void storeContainerDiagnostics(ContainerId containerId, StringBuilder diagnostics) throws IOException; + /** + * Record remaining retry attempts for a container. + * @param containerId the container ID + * @param remainingRetryAttempts the number of retry attempts remaining when + * the container fails to run + * @throws IOException + */ + public abstract void storeContainerRemainingRetryAttempts( + ContainerId containerId, int remainingRetryAttempts) throws IOException; + + /** + * Record working directory for a container. + * @param containerId the container ID + * @param workDir the working directory + * @throws IOException + */ + public abstract void storeContainerWorkDir( + ContainerId containerId, String workDir) throws IOException; + + /** + * Record log directory for a container. + * @param containerId the container ID + * @param logDir the log directory + * @throws IOException + */ + public abstract void storeContainerLogDir( + ContainerId containerId, String logDir) throws IOException; + /** * Remove records corresponding to a container * @param containerId the container ID diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index cc98bdc54d..118bc42bcb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -54,6 +54,8 @@ import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryContext; +import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LocalResourceType; @@ -660,6 +662,69 @@ public void testLaunchAfterKillRequest() throws Exception { } } + @Test + public void testContainerRetry() throws Exception { + ContainerRetryContext containerRetryContext1 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.NEVER_RETRY, null, 3, 0); + testContainerRetry(containerRetryContext1, 2, 0); + + ContainerRetryContext containerRetryContext2 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 0); + testContainerRetry(containerRetryContext2, 2, 3); + + ContainerRetryContext containerRetryContext3 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 0); + // If exit code is 0, it will not retry + testContainerRetry(containerRetryContext3, 0, 0); + + ContainerRetryContext containerRetryContext4 = ContainerRetryContext + .newInstance( 
ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES, null, 3, 0); + testContainerRetry(containerRetryContext4, 2, 0); + + HashSet<Integer> errorCodes = new HashSet<>(); + errorCodes.add(2); + errorCodes.add(6); + ContainerRetryContext containerRetryContext5 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES, + errorCodes, 3, 0); + testContainerRetry(containerRetryContext5, 2, 3); + + HashSet<Integer> errorCodes2 = new HashSet<>(); + errorCodes2.add(143); + ContainerRetryContext containerRetryContext6 = ContainerRetryContext + .newInstance(ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES, + errorCodes2, 3, 0); + // If exit code is 143 (SIGTERM), it will not retry even if it is in errorCodes. + testContainerRetry(containerRetryContext6, 143, 0); + } + + private void testContainerRetry(ContainerRetryContext containerRetryContext, + int exitCode, int expectedRetries) throws Exception { + WrappedContainer wc = null; + try { + int retryTimes = 0; + wc = new WrappedContainer(24, 314159265358979L, 4344, "yak", + containerRetryContext); + wc.initContainer(); + wc.localizeResources(); + wc.launchContainer(); + while (true) { + wc.containerFailed(exitCode); + if (wc.c.getContainerState() == ContainerState.RUNNING) { + retryTimes++; + } else { + break; + } + } + Assert.assertEquals(expectedRetries, retryTimes); + } finally { + if (wc != null) { + wc.finished(); + } + } + } + private void verifyCleanupCall(WrappedContainer wc) throws Exception { ResourcesReleasedMatcher matchesReq = new ResourcesReleasedMatcher(wc.localResources, EnumSet.of( @@ -802,12 +867,23 @@ private class WrappedContainer { WrappedContainer(int appId, long timestamp, int id, String user) throws IOException { - this(appId, timestamp, id, user, true, false); + this(appId, timestamp, id, user, null); + } + + WrappedContainer(int appId, long timestamp, int id, String user, + ContainerRetryContext containerRetryContext) throws IOException { + this(appId, timestamp, id, user, true, false, containerRetryContext); + } + + WrappedContainer(int appId, long timestamp, int id, String user, + boolean withLocalRes, boolean withServiceData) throws IOException { + this(appId, timestamp, id, user, withLocalRes, withServiceData, null); + } @SuppressWarnings("rawtypes") WrappedContainer(int appId, long timestamp, int id, String user, - boolean withLocalRes, boolean withServiceData) throws IOException { + boolean withLocalRes, boolean withServiceData, + ContainerRetryContext containerRetryContext) throws IOException { dispatcher = new DrainDispatcher(); dispatcher.init(new Configuration()); @@ -884,6 +960,7 @@ private class WrappedContainer { serviceData = Collections. 
emptyMap(); } when(ctxt.getServiceData()).thenReturn(serviceData); + when(ctxt.getContainerRetryContext()).thenReturn(containerRetryContext); c = new ContainerImpl(conf, dispatcher, ctxt, null, metrics, identifier, context); @@ -1005,6 +1082,10 @@ public void containerFailed(int exitCode) { assert containerStatus.getDiagnostics().contains(diagnosticMsg); assert containerStatus.getExitStatus() == exitCode; drainDispatcherEvents(); + // If container needs retry, relaunch it + if (c.getContainerState() == ContainerState.RELAUNCHING) { + launchContainer(); + } } public void killContainer() { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java index 1279896339..46522453ff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java @@ -115,6 +115,9 @@ public synchronized List loadContainersState() rcsCopy.diagnostics = rcs.diagnostics; rcsCopy.startRequest = rcs.startRequest; rcsCopy.capability = rcs.capability; + rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts()); + rcsCopy.setWorkDir(rcs.getWorkDir()); + rcsCopy.setLogDir(rcs.getLogDir()); result.add(rcsCopy); } return result; @@ -167,6 +170,27 @@ public synchronized void storeContainerCompleted(ContainerId containerId, rcs.exitCode = exitCode; } + @Override + public void storeContainerRemainingRetryAttempts(ContainerId containerId, + int remainingRetryAttempts) throws IOException { + RecoveredContainerState rcs = getRecoveredContainerState(containerId); + rcs.setRemainingRetryAttempts(remainingRetryAttempts); + } + + @Override + public void storeContainerWorkDir(ContainerId containerId, + String workDir) throws IOException { + RecoveredContainerState rcs = getRecoveredContainerState(containerId); + rcs.setWorkDir(workDir); + } + + @Override + public void storeContainerLogDir(ContainerId containerId, + String logDir) throws IOException { + RecoveredContainerState rcs = getRecoveredContainerState(containerId); + rcs.setLogDir(logDir); + } + @Override public synchronized void removeContainer(ContainerId containerId) throws IOException { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java index e44e5e51e6..ccc9254afc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java @@ -334,6 +334,18 @@ public void testContainerStorage() throws IOException { 
assertEquals(containerReq, rcs.getStartRequest()); assertEquals(diags.toString(), rcs.getDiagnostics()); + // store remainingRetryAttempts, workDir and logDir + stateStore.storeContainerRemainingRetryAttempts(containerId, 6); + stateStore.storeContainerWorkDir(containerId, "/test/workdir"); + stateStore.storeContainerLogDir(containerId, "/test/logdir"); + restartStateStore(); + recoveredContainers = stateStore.loadContainersState(); + assertEquals(1, recoveredContainers.size()); + rcs = recoveredContainers.get(0); + assertEquals(6, rcs.getRemainingRetryAttempts()); + assertEquals("/test/workdir", rcs.getWorkDir()); + assertEquals("/test/logdir", rcs.getLogDir()); + // remove the container and verify not recovered stateStore.removeContainer(containerId); restartStateStore(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java index 394a92cb19..0b95dba465 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java @@ -144,4 +144,32 @@ public ContainerTokenIdentifier getContainerTokenIdentifier() { public NMContainerStatus getNMContainerStatus() { return null; } + + @Override + public boolean isRetryContextSet() { + return false; + } + + @Override + public boolean shouldRetry(int errorCode) { + return false; + } + + @Override + public String getWorkDir() { + return null; + } + + @Override + public void setWorkDir(String workDir) { + } + + @Override + public String getLogDir() { + return null; + } + + @Override + public void setLogDir(String logDir) { + } }
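
For reference, a minimal sketch (not part of the patch) of how an application could opt a container into NodeManager-side relaunch using the new records. The ContainerRetryContext.newInstance(policy, errorCodes, maxRetries, retryInterval) call mirrors the usage in TestContainer above; the setContainerRetryContext setter on ContainerLaunchContext and the exit code 10 are assumptions made only for illustration.

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerRetryContext;
import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy;

public class RetryContextExample {
  /**
   * Ask the NodeManager to relaunch the container up to 3 times, 2000 ms
   * apart, but only when it exits with the (hypothetical) code 10.
   */
  static void addRetryPolicy(ContainerLaunchContext launchContext) {
    Set<Integer> retryableExitCodes = new HashSet<>();
    retryableExitCodes.add(10); // assumed application-specific exit code
    ContainerRetryContext retryContext = ContainerRetryContext.newInstance(
        ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES,
        retryableExitCodes, 3, 2000);
    // Assumes the setter added alongside getContainerRetryContext() in this patch.
    launchContext.setContainerRetryContext(retryContext);
  }
}

Passing ContainerRetryContext.RETRY_FOREVER as maxRetries keeps relaunching until the container succeeds or is killed, matching the remainingRetryAttempts check in ContainerImpl#shouldRetry above.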