YARN-2934. Improve handling of container's stderr. (Naganarasimha G R via gera)
This commit is contained in:
parent
65537845a9
commit
2c17b81569
@ -83,6 +83,9 @@ Release 2.9.0 - UNRELEASED
|
||||
YARN-4156. TestAMRestart#testAMBlacklistPreventsRestartOnSameNode
|
||||
assumes CapacityScheduler. (Anubhav Dhoot via kasha)
|
||||
|
||||
YARN-2934. Improve handling of container's stderr.
|
||||
(Naganarasimha G R via gera)
|
||||
|
||||
Release 2.8.0 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -1333,6 +1333,17 @@ private static void addDeprecatedKeys() {
|
||||
public static final String NM_USER_HOME_DIR =
|
||||
NM_PREFIX + "user-home-dir";
|
||||
|
||||
/**
 * Configuration key for the glob pattern used to find the container's
 * error log file inside the container log directory.
 */
public static final String NM_CONTAINER_STDERR_PATTERN =
|
||||
NM_PREFIX + "container.stderr.pattern";
|
||||
|
||||
/**
 * Default error log pattern: any file name containing "stderr" or "STDERR".
 */
public static final String DEFAULT_NM_CONTAINER_STDERR_PATTERN =
|
||||
"{*stderr*,*STDERR*}";
|
||||
|
||||
/**
 * Configuration key for the number of bytes to read from the end of the
 * container's error log file.
 */
public static final String NM_CONTAINER_STDERR_BYTES =
|
||||
NM_PREFIX + "container.stderr.tail.bytes";
|
||||
|
||||
/** Default error log tail size: 4 KiB. */
public static final long DEFAULT_NM_CONTAINER_STDERR_BYTES = 4 * 1024;
|
||||
|
||||
/**The kerberos principal to be used for spnego filter for NM.*/
|
||||
public static final String NM_WEBAPP_SPNEGO_USER_NAME_KEY =
|
||||
NM_PREFIX + "webapp.spnego-principal";
|
||||
|
@ -2459,6 +2459,29 @@
|
||||
<value>org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Error filename pattern, to identify the file in the container's
|
||||
log directory which contains the container's error log. As error file
|
||||
redirection is done by client/AM and yarn will not be aware of the error
|
||||
file name. YARN uses this pattern to identify the error file and tail
|
||||
the error log as diagnostics when the container execution returns non zero
|
||||
value. Filename patterns are case sensitive and should match the
|
||||
specifications of FileSystem.globStatus(Path) api. If multiple filenames
|
||||
match the pattern, the most recently modified matching file will be picked.
|
||||
</description>
|
||||
<name>yarn.nodemanager.container.stderr.pattern</name>
|
||||
<value>{*stderr*,*STDERR*}</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Size of the container error file which needs to be tailed, in bytes.
|
||||
</description>
|
||||
<name>yarn.nodemanager.container.stderr.tail.bytes</name>
|
||||
<value>4096</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Enable/disable blacklisting of hosts for AM based on AM failures on those
|
||||
|
@ -26,6 +26,7 @@
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
@ -38,7 +39,10 @@
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.LocalDirAllocator;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@ -61,6 +65,7 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
@ -71,7 +76,6 @@
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
|
||||
@ -171,6 +175,7 @@ public Integer call() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
Path containerLogDir;
|
||||
try {
|
||||
localResources = container.getLocalizedResources();
|
||||
if (localResources == null) {
|
||||
@ -186,7 +191,7 @@ public Integer call() {
|
||||
String appIdStr = app.getAppId().toString();
|
||||
String relativeContainerLogDir = ContainerLaunch
|
||||
.getRelativeContainerLogDir(appIdStr, containerIdStr);
|
||||
Path containerLogDir =
|
||||
containerLogDir =
|
||||
dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
|
||||
for (String str : command) {
|
||||
// TODO: Should we instead work via symlinks without this grammar?
|
||||
@ -334,6 +339,11 @@ public Integer call() {
|
||||
LOG.debug("Container " + containerIdStr + " completed with exit code "
|
||||
+ ret);
|
||||
}
|
||||
|
||||
StringBuilder diagnosticInfo =
|
||||
new StringBuilder("Container exited with a non-zero exit code ");
|
||||
diagnosticInfo.append(ret);
|
||||
diagnosticInfo.append(". ");
|
||||
if (ret == ExitCode.FORCE_KILLED.getExitCode()
|
||||
|| ret == ExitCode.TERMINATED.getExitCode()) {
|
||||
// If the process was killed, Send container_cleanedup_after_kill and
|
||||
@ -341,16 +351,13 @@ public Integer call() {
|
||||
dispatcher.getEventHandler().handle(
|
||||
new ContainerExitEvent(containerID,
|
||||
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret,
|
||||
"Container exited with a non-zero exit code " + ret));
|
||||
diagnosticInfo.toString()));
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ret != 0) {
|
||||
LOG.warn("Container exited with a non-zero exit code " + ret);
|
||||
this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
|
||||
containerID,
|
||||
ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
|
||||
"Container exited with a non-zero exit code " + ret));
|
||||
handleContainerExitWithFailure(containerID, ret, containerLogDir,
|
||||
diagnosticInfo);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -361,6 +368,78 @@ public Integer call() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to tail and fetch TAIL_SIZE_IN_BYTES of data from the error log.
|
||||
* ErrorLog filename is not fixed and depends upon app, hence file name
|
||||
* pattern is used.
|
||||
* @param containerID
|
||||
* @param ret
|
||||
* @param containerLogDir
|
||||
* @param diagnosticInfo
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void handleContainerExitWithFailure(ContainerId containerID, int ret,
|
||||
Path containerLogDir, StringBuilder diagnosticInfo) {
|
||||
LOG.warn(diagnosticInfo);
|
||||
|
||||
String errorFileNamePattern =
|
||||
conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
|
||||
FSDataInputStream errorFileIS = null;
|
||||
try {
|
||||
FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
|
||||
FileStatus[] errorFileStatuses = fileSystem
|
||||
.globStatus(new Path(containerLogDir, errorFileNamePattern));
|
||||
if (errorFileStatuses != null && errorFileStatuses.length != 0) {
|
||||
long tailSizeInBytes =
|
||||
conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
|
||||
YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);
|
||||
Path errorFile = errorFileStatuses[0].getPath();
|
||||
long fileSize = errorFileStatuses[0].getLen();
|
||||
|
||||
// if more than one file matches the stderr pattern, take the latest
|
||||
// modified file, and also append the file names in the diagnosticInfo
|
||||
if (errorFileStatuses.length > 1) {
|
||||
String[] errorFileNames = new String[errorFileStatuses.length];
|
||||
long latestModifiedTime = errorFileStatuses[0].getModificationTime();
|
||||
errorFileNames[0] = errorFileStatuses[0].getPath().getName();
|
||||
for (int i = 1; i < errorFileStatuses.length; i++) {
|
||||
errorFileNames[i] = errorFileStatuses[i].getPath().getName();
|
||||
if (errorFileStatuses[i]
|
||||
.getModificationTime() > latestModifiedTime) {
|
||||
latestModifiedTime = errorFileStatuses[i].getModificationTime();
|
||||
errorFile = errorFileStatuses[i].getPath();
|
||||
fileSize = errorFileStatuses[i].getLen();
|
||||
}
|
||||
}
|
||||
diagnosticInfo.append("Error files: ")
|
||||
.append(StringUtils.join(", ", errorFileNames)).append(".\n");
|
||||
}
|
||||
|
||||
long startPosition =
|
||||
(fileSize < tailSizeInBytes) ? 0 : fileSize - tailSizeInBytes;
|
||||
int bufferSize =
|
||||
(int) ((fileSize < tailSizeInBytes) ? fileSize : tailSizeInBytes);
|
||||
byte[] tailBuffer = new byte[bufferSize];
|
||||
errorFileIS = fileSystem.open(errorFile);
|
||||
errorFileIS.readFully(startPosition, tailBuffer);
|
||||
|
||||
diagnosticInfo.append("Last ").append(tailSizeInBytes)
|
||||
.append(" bytes of ").append(errorFile.getName()).append(" :\n")
|
||||
.append(new String(tailBuffer, StandardCharsets.UTF_8));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to get tail of the container's error log file", e);
|
||||
} finally {
|
||||
IOUtils.cleanup(LOG, errorFileIS);
|
||||
}
|
||||
|
||||
this.dispatcher.getEventHandler()
|
||||
.handle(new ContainerExitEvent(containerID,
|
||||
ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
|
||||
diagnosticInfo.toString()));
|
||||
}
|
||||
|
||||
protected String getPidFileSubpath(String appIdStr, String containerIdStr) {
|
||||
return getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
|
||||
+ String.format(ContainerLaunch.PID_FILE_NAME_FMT, containerIdStr);
|
||||
|
@ -21,7 +21,6 @@
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.junit.matchers.JUnitMatchers.containsString;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@ -48,6 +47,7 @@
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.util.Shell.ExitCodeException;
|
||||
@ -81,19 +81,21 @@
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.ShellScriptBuilder;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||
import org.apache.hadoop.yarn.util.Apps;
|
||||
import org.apache.hadoop.yarn.util.AuxiliaryServiceHelper;
|
||||
@ -108,6 +110,7 @@
|
||||
|
||||
public class TestContainerLaunch extends BaseContainerManagerTest {
|
||||
|
||||
private static final String INVALID_JAVA_HOME = "/no/jvm/here";
|
||||
protected Context distContext = new NMContext(new NMContainerTokenSecretManager(
|
||||
conf), new NMTokenSecretManagerInNM(), null,
|
||||
new ApplicationACLsManager(conf), new NMNullStateStoreService()) {
|
||||
@ -492,6 +495,147 @@ public void handle(Event event) {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorLogOnContainerExit() throws Exception {
|
||||
verifyTailErrorLogOnContainerExit(new Configuration(), "/stderr", false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorLogOnContainerExitForCase() throws Exception {
|
||||
verifyTailErrorLogOnContainerExit(new Configuration(), "/STDERR.log",
|
||||
false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorLogOnContainerExitForExt() throws Exception {
|
||||
verifyTailErrorLogOnContainerExit(new Configuration(), "/AppMaster.stderr",
|
||||
false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorLogOnContainerExitWithCustomPattern() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
conf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
|
||||
"{*stderr*,*log*}");
|
||||
verifyTailErrorLogOnContainerExit(conf, "/error.log", false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorLogOnContainerExitWithMultipleFiles() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
conf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
|
||||
"{*stderr*,*stdout*}");
|
||||
verifyTailErrorLogOnContainerExit(conf, "/stderr.log", true);
|
||||
}
|
||||
|
||||
private void verifyTailErrorLogOnContainerExit(Configuration conf,
|
||||
String errorFileName, boolean testForMultipleErrFiles) throws Exception {
|
||||
Container container = mock(Container.class);
|
||||
ApplicationId appId =
|
||||
ApplicationId.newInstance(System.currentTimeMillis(), 1);
|
||||
ContainerId containerId = ContainerId
|
||||
.newContainerId(ApplicationAttemptId.newInstance(appId, 1), 1);
|
||||
when(container.getContainerId()).thenReturn(containerId);
|
||||
when(container.getUser()).thenReturn("test");
|
||||
String relativeContainerLogDir = ContainerLaunch.getRelativeContainerLogDir(
|
||||
appId.toString(), ConverterUtils.toString(containerId));
|
||||
Path containerLogDir =
|
||||
dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
|
||||
|
||||
ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
|
||||
List<String> invalidCommand = new ArrayList<String>();
|
||||
invalidCommand.add("$JAVA_HOME/bin/java");
|
||||
invalidCommand.add("-Djava.io.tmpdir=$PWD/tmp");
|
||||
invalidCommand.add("-Dlog4j.configuration=container-log4j.properties");
|
||||
invalidCommand.add("-Dyarn.app.container.log.dir=" + containerLogDir);
|
||||
invalidCommand.add("-Dyarn.app.container.log.filesize=0");
|
||||
invalidCommand.add("-Dhadoop.root.logger=INFO,CLA");
|
||||
invalidCommand.add("-Dhadoop.root.logfile=syslog");
|
||||
invalidCommand.add("-Xmx1024m");
|
||||
invalidCommand.add("org.apache.hadoop.mapreduce.v2.app.MRAppMaster");
|
||||
invalidCommand.add("1>" + containerLogDir + "/stdout");
|
||||
invalidCommand.add("2>" + containerLogDir + errorFileName);
|
||||
when(clc.getCommands()).thenReturn(invalidCommand);
|
||||
|
||||
Map<String, String> userSetEnv = new HashMap<String, String>();
|
||||
userSetEnv.put(Environment.CONTAINER_ID.name(), "user_set_container_id");
|
||||
userSetEnv.put("JAVA_HOME", INVALID_JAVA_HOME);
|
||||
userSetEnv.put(Environment.NM_HOST.name(), "user_set_NM_HOST");
|
||||
userSetEnv.put(Environment.NM_PORT.name(), "user_set_NM_PORT");
|
||||
userSetEnv.put(Environment.NM_HTTP_PORT.name(), "user_set_NM_HTTP_PORT");
|
||||
userSetEnv.put(Environment.LOCAL_DIRS.name(), "user_set_LOCAL_DIR");
|
||||
userSetEnv.put(Environment.USER.key(),
|
||||
"user_set_" + Environment.USER.key());
|
||||
userSetEnv.put(Environment.LOGNAME.name(), "user_set_LOGNAME");
|
||||
userSetEnv.put(Environment.PWD.name(), "user_set_PWD");
|
||||
userSetEnv.put(Environment.HOME.name(), "user_set_HOME");
|
||||
userSetEnv.put(Environment.CLASSPATH.name(), "APATH");
|
||||
when(clc.getEnvironment()).thenReturn(userSetEnv);
|
||||
when(container.getLaunchContext()).thenReturn(clc);
|
||||
|
||||
when(container.getLocalizedResources())
|
||||
.thenReturn(Collections.<Path, List<String>> emptyMap());
|
||||
Dispatcher dispatcher = mock(Dispatcher.class);
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
ContainerExitHandler eventHandler =
|
||||
new ContainerExitHandler(testForMultipleErrFiles);
|
||||
when(dispatcher.getEventHandler()).thenReturn(eventHandler);
|
||||
|
||||
Application app = mock(Application.class);
|
||||
when(app.getAppId()).thenReturn(appId);
|
||||
when(app.getUser()).thenReturn("test");
|
||||
|
||||
Credentials creds = mock(Credentials.class);
|
||||
when(container.getCredentials()).thenReturn(creds);
|
||||
|
||||
((NMContext) context).setNodeId(NodeId.newInstance("127.0.0.1", HTTP_PORT));
|
||||
|
||||
ContainerLaunch launch = new ContainerLaunch(context, conf, dispatcher,
|
||||
exec, app, container, dirsHandler, containerManager);
|
||||
launch.call();
|
||||
Assert.assertTrue("ContainerExitEvent should have occured",
|
||||
eventHandler.isContainerExitEventOccured());
|
||||
}
|
||||
|
||||
private static class ContainerExitHandler
|
||||
implements EventHandler<ContainerEvent> {
|
||||
private boolean testForMultiFile;
|
||||
|
||||
ContainerExitHandler(boolean testForMultiFile) {
|
||||
this.testForMultiFile = testForMultiFile;
|
||||
}
|
||||
|
||||
boolean containerExitEventOccured = false;
|
||||
|
||||
public boolean isContainerExitEventOccured() {
|
||||
return containerExitEventOccured;
|
||||
}
|
||||
|
||||
public void handle(ContainerEvent event) {
|
||||
if (event instanceof ContainerExitEvent) {
|
||||
containerExitEventOccured = true;
|
||||
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
|
||||
Assert.assertEquals(ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
|
||||
exitEvent.getType());
|
||||
LOG.info("Diagnostic Info : " + exitEvent.getDiagnosticInfo());
|
||||
if (testForMultiFile) {
|
||||
Assert.assertTrue("Should contain the Multi file information",
|
||||
exitEvent.getDiagnosticInfo().contains("Error files: "));
|
||||
}
|
||||
Assert.assertTrue(
|
||||
"Should contain the error Log message with tail size info",
|
||||
exitEvent.getDiagnosticInfo()
|
||||
.contains("Last "
|
||||
+ YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES
|
||||
+ " bytes of"));
|
||||
Assert.assertTrue("Should contain contents of error Log",
|
||||
exitEvent.getDiagnosticInfo().contains(
|
||||
INVALID_JAVA_HOME + "/bin/java: No such file or directory"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> getJarManifestClasspath(String path)
|
||||
throws Exception {
|
||||
List<String> classpath = new ArrayList<String>();
|
||||
|
Loading…
Reference in New Issue
Block a user