From 658b1bf561aa1c0f02fc0dbb079011f959709c25 Mon Sep 17 00:00:00 2001 From: Jason Darrell Lowe Date: Wed, 26 Feb 2014 21:32:34 +0000 Subject: [PATCH] MAPREDUCE-5754. Preserve Job diagnostics in history. Contributed by Gera Shegalov git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1572269 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 + .../jobhistory/JobHistoryEventHandler.java | 8 +- .../mapreduce/v2/app/job/impl/JobImpl.java | 12 +- .../mapreduce/jobhistory/TestEvents.java | 12 +- .../src/main/avro/Events.avpr | 3 +- .../jobhistory/JobHistoryParser.java | 5 +- .../JobUnsuccessfulCompletionEvent.java | 57 ++++++-- .../hadoop-mapreduce-client-hs/pom.xml | 2 +- .../v2/hs/TestJobHistoryParsing.java | 138 +++++++++++++++++- ...835-0-0-FAILED-default-1393307693920.jhist | 19 +++ 10 files changed, 224 insertions(+), 35 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/resources/job_1393307629410_0001-1393307687476-user-Sleep+job-1393307723835-0-0-FAILED-default-1393307693920.jhist diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 65ade2c57a..b0d707d038 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -176,6 +176,9 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-5761. Added a simple log message to denote when encrypted shuffle is on in the shuffle-handler. (Jian He via vinodkv) + MAPREDUCE-5754. Preserve Job diagnostics in history (Gera Shegalov via + jlowe) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryEventHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryEventHandler.java index 3de5ba7e76..7ab02287b1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryEventHandler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryEventHandler.java @@ -48,6 +48,7 @@ import org.apache.hadoop.mapreduce.v2.api.records.JobId; import org.apache.hadoop.mapreduce.v2.api.records.JobState; import org.apache.hadoop.mapreduce.v2.app.AppContext; +import org.apache.hadoop.mapreduce.v2.app.job.Job; import org.apache.hadoop.mapreduce.v2.jobhistory.FileNameIndexUtils; import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils; import org.apache.hadoop.mapreduce.v2.jobhistory.JobIndexInfo; @@ -343,11 +344,12 @@ protected void serviceStop() throws Exception { LOG.warn("Found jobId " + toClose + " to have not been closed. Will close"); //Create a JobFinishEvent so that it is written to the job history + final Job job = context.getJob(toClose); JobUnsuccessfulCompletionEvent jucEvent = new JobUnsuccessfulCompletionEvent(TypeConverter.fromYarn(toClose), - System.currentTimeMillis(), context.getJob(toClose) - .getCompletedMaps(), context.getJob(toClose).getCompletedReduces(), - JobState.KILLED.toString()); + System.currentTimeMillis(), job.getCompletedMaps(), + job.getCompletedReduces(), JobState.KILLED.toString(), + job.getDiagnostics()); JobHistoryEvent jfEvent = new JobHistoryEvent(toClose, jucEvent); //Bypass the queue mechanism which might wait. Call the method directly handleEvent(jfEvent); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java index 88dc99eb36..36bfca7183 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/JobImpl.java @@ -149,6 +149,9 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job, // Maximum no. of fetch-failure notifications after which map task is failed private static final int MAX_FETCH_FAILURES_NOTIFICATIONS = 3; + + public static final String JOB_KILLED_DIAG = + "Job received Kill while in RUNNING state."; //final fields private final ApplicationAttemptId applicationAttemptId; @@ -1617,7 +1620,8 @@ private void unsuccessfulFinish(JobStateInternal finalState) { finishTime, succeededMapTaskCount, succeededReduceTaskCount, - finalState.toString()); + finalState.toString(), + diagnostics); eventHandler.handle(new JobHistoryEvent(jobId, unsuccessfulJobEvent)); finished(finalState); @@ -1730,7 +1734,7 @@ public void transition(JobImpl job, JobEvent event) { JobUnsuccessfulCompletionEvent failedEvent = new JobUnsuccessfulCompletionEvent(job.oldJobId, job.finishTime, 0, 0, - JobStateInternal.KILLED.toString()); + JobStateInternal.KILLED.toString(), job.diagnostics); job.eventHandler.handle(new JobHistoryEvent(job.jobId, failedEvent)); job.finished(JobStateInternal.KILLED); } @@ -1763,7 +1767,7 @@ private static class KillTasksTransition implements SingleArcTransition { @Override public void transition(JobImpl job, JobEvent event) { - job.addDiagnostic("Job received Kill while in RUNNING state."); + job.addDiagnostic(JOB_KILLED_DIAG); for (Task task : job.tasks.values()) { job.eventHandler.handle( new TaskEvent(task.getID(), TaskEventType.T_KILL)); @@ -2127,7 +2131,7 @@ public void transition(JobImpl job, JobEvent event) { JobUnsuccessfulCompletionEvent failedEvent = new JobUnsuccessfulCompletionEvent(job.oldJobId, job.finishTime, 0, 0, - jobHistoryString); + jobHistoryString, job.diagnostics); job.eventHandler.handle(new JobHistoryEvent(job.jobId, failedEvent)); job.finished(terminationState); } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/jobhistory/TestEvents.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/jobhistory/TestEvents.java index d383f2912f..59ba5b0181 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/jobhistory/TestEvents.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/jobhistory/TestEvents.java @@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskID; import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl; import org.junit.Test; public class TestEvents { @@ -334,11 +335,12 @@ private FakeEvent getSetupAttemptStartedEvent() { private FakeEvent getJobKilledEvent() { FakeEvent result = new FakeEvent(EventType.JOB_KILLED); JobUnsuccessfulCompletion datum = new JobUnsuccessfulCompletion(); - datum.finishedMaps = 1; - datum.finishedReduces = 2; - datum.finishTime = 3; - datum.jobid = "ID"; - datum.jobStatus = "STATUS"; + datum.setFinishedMaps(1); + datum.setFinishedReduces(2); + datum.setFinishTime(3L); + datum.setJobid("ID"); + datum.setJobStatus("STATUS"); + datum.setDiagnostics(JobImpl.JOB_KILLED_DIAG); result.setDatum(datum); return result; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/avro/Events.avpr b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/avro/Events.avpr index fa194338cf..2d924d522b 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/avro/Events.avpr +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/avro/Events.avpr @@ -135,7 +135,8 @@ {"name": "finishTime", "type": "long"}, {"name": "finishedMaps", "type": "int"}, {"name": "finishedReduces", "type": "int"}, - {"name": "jobStatus", "type": "string"} + {"name": "jobStatus", "type": "string"}, + {"name": "diagnostics", "type": "string"} ] }, diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java index 19e2a51a13..295de6373d 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java @@ -353,10 +353,6 @@ private void handleTaskFailedEvent(TaskFailedEvent event) { taskInfo.error = StringInterner.weakIntern(event.getError()); taskInfo.failedDueToAttemptId = event.getFailedAttemptID(); taskInfo.counters = event.getCounters(); - if (info.errorInfo.isEmpty()) { - info.errorInfo = "Task " + taskInfo.taskId + " failed " + - taskInfo.attemptsMap.size() + " times "; - } } private void handleTaskStartedEvent(TaskStartedEvent event) { @@ -373,6 +369,7 @@ private void handleJobFailedEvent(JobUnsuccessfulCompletionEvent event) { info.finishedMaps = event.getFinishedMaps(); info.finishedReduces = event.getFinishedReduces(); info.jobStatus = StringInterner.weakIntern(event.getStatus()); + info.errorInfo = StringInterner.weakIntern(event.getDiagnostics()); } private void handleJobFinishedEvent(JobFinishedEvent event) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobUnsuccessfulCompletionEvent.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobUnsuccessfulCompletionEvent.java index 3adb91f2a8..2d6a68e1d4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobUnsuccessfulCompletionEvent.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobUnsuccessfulCompletionEvent.java @@ -18,11 +18,15 @@ package org.apache.hadoop.mapreduce.jobhistory; +import com.google.common.base.Joiner; + import org.apache.avro.util.Utf8; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.mapreduce.JobID; +import java.util.Collections; + /** * Event to record Failed and Killed completion of jobs * @@ -30,6 +34,10 @@ @InterfaceAudience.Private @InterfaceStability.Unstable public class JobUnsuccessfulCompletionEvent implements HistoryEvent { + private static final String NODIAGS = ""; + private static final Iterable NODIAGS_LIST = + Collections.singletonList(NODIAGS); + private JobUnsuccessfulCompletion datum = new JobUnsuccessfulCompletion(); @@ -44,11 +52,33 @@ public class JobUnsuccessfulCompletionEvent implements HistoryEvent { public JobUnsuccessfulCompletionEvent(JobID id, long finishTime, int finishedMaps, int finishedReduces, String status) { - datum.jobid = new Utf8(id.toString()); - datum.finishTime = finishTime; - datum.finishedMaps = finishedMaps; - datum.finishedReduces = finishedReduces; - datum.jobStatus = new Utf8(status); + this(id, finishTime, finishedMaps, finishedReduces, status, NODIAGS_LIST); + } + + /** + * Create an event to record unsuccessful completion (killed/failed) of jobs + * @param id Job ID + * @param finishTime Finish time of the job + * @param finishedMaps Number of finished maps + * @param finishedReduces Number of finished reduces + * @param status Status of the job + * @param diagnostics job runtime diagnostics + */ + public JobUnsuccessfulCompletionEvent(JobID id, long finishTime, + int finishedMaps, + int finishedReduces, + String status, + Iterable diagnostics) { + datum.setJobid(new Utf8(id.toString())); + datum.setFinishTime(finishTime); + datum.setFinishedMaps(finishedMaps); + datum.setFinishedReduces(finishedReduces); + datum.setJobStatus(new Utf8(status)); + if (diagnostics == null) { + diagnostics = NODIAGS_LIST; + } + datum.setDiagnostics(new Utf8(Joiner.on('\n').skipNulls() + .join(diagnostics))); } JobUnsuccessfulCompletionEvent() {} @@ -61,13 +91,13 @@ public void setDatum(Object datum) { /** Get the Job ID */ public JobID getJobId() { return JobID.forName(datum.jobid.toString()); } /** Get the job finish time */ - public long getFinishTime() { return datum.finishTime; } + public long getFinishTime() { return datum.getFinishTime(); } /** Get the number of finished maps */ - public int getFinishedMaps() { return datum.finishedMaps; } + public int getFinishedMaps() { return datum.getFinishedMaps(); } /** Get the number of finished reduces */ - public int getFinishedReduces() { return datum.finishedReduces; } + public int getFinishedReduces() { return datum.getFinishedReduces(); } /** Get the status */ - public String getStatus() { return datum.jobStatus.toString(); } + public String getStatus() { return datum.getJobStatus().toString(); } /** Get the event type */ public EventType getEventType() { if ("FAILED".equals(getStatus())) { @@ -78,4 +108,13 @@ public EventType getEventType() { return EventType.JOB_KILLED; } + /** + * Retrieves diagnostics information preserved in the history file + * + * @return diagnostics as of the time of job termination + */ + public String getDiagnostics() { + final CharSequence diagnostics = datum.getDiagnostics(); + return diagnostics == null ? NODIAGS : diagnostics.toString(); + } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml index 3ac191fb86..ad887bba63 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml @@ -73,7 +73,7 @@ src/test/resources/job_1329348432655_0001_conf.xml - src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist + src/test/resources/*.jhist diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistoryParsing.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistoryParsing.java index 3d704ef053..8c0c4963f6 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistoryParsing.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistoryParsing.java @@ -18,6 +18,10 @@ package org.apache.hadoop.mapreduce.v2.hs; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic + .NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -25,6 +29,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -36,9 +41,9 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.JobID; @@ -53,6 +58,7 @@ import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo; +import org.apache.hadoop.mapreduce.jobhistory.JobUnsuccessfulCompletionEvent; import org.apache.hadoop.mapreduce.jobhistory.TaskFailedEvent; import org.apache.hadoop.mapreduce.jobhistory.TaskFinishedEvent; import org.apache.hadoop.mapreduce.jobhistory.TaskStartedEvent; @@ -66,8 +72,11 @@ import org.apache.hadoop.mapreduce.v2.api.records.impl.pb.TaskIdPBImpl; import org.apache.hadoop.mapreduce.v2.app.MRApp; import org.apache.hadoop.mapreduce.v2.app.job.Job; +import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl; import org.apache.hadoop.mapreduce.v2.app.job.Task; import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt; +import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent; +import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType; import org.apache.hadoop.mapreduce.v2.hs.HistoryFileManager.HistoryFileInfo; @@ -149,7 +158,7 @@ private void checkHistoryParsing(final int numMaps, final int numReduces, conf.set(MRJobConfig.USER_NAME, System.getProperty("user.name")); long amStartTimeEst = System.currentTimeMillis(); conf.setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(conf); MRApp app = new MRAppWithHistory(numMaps, numReduces, true, this.getClass() @@ -390,7 +399,7 @@ public void testHistoryParsingForFailedAttempts() throws Exception { try { Configuration conf = new Configuration(); conf.setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(conf); MRApp app = new MRAppWithHistoryWithFailedAttempt(2, 1, true, this @@ -455,7 +464,7 @@ public void testCountersForFailedTask() throws Exception { try { Configuration conf = new Configuration(); conf.setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(conf); MRApp app = new MRAppWithHistoryWithFailedTask(2, 1, true, this @@ -499,18 +508,85 @@ public void testCountersForFailedTask() throws Exception { Assert.assertNotNull("completed task report has null counters", ct .getReport().getCounters()); } + final List originalDiagnostics = job.getDiagnostics(); + final String historyError = jobInfo.getErrorInfo(); + assertTrue("No original diagnostics for a failed job", + originalDiagnostics != null && !originalDiagnostics.isEmpty()); + assertNotNull("No history error info for a failed job ", historyError); + for (String diagString : originalDiagnostics) { + assertTrue(historyError.contains(diagString)); + } } finally { LOG.info("FINISHED testCountersForFailedTask"); } } + @Test(timeout = 60000) + public void testDiagnosticsForKilledJob() throws Exception { + LOG.info("STARTING testDiagnosticsForKilledJob"); + try { + final Configuration conf = new Configuration(); + conf.setClass( + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + MyResolver.class, DNSToSwitchMapping.class); + RackResolver.init(conf); + MRApp app = new MRAppWithHistoryWithJobKilled(2, 1, true, this + .getClass().getName(), true); + app.submit(conf); + Job job = app.getContext().getAllJobs().values().iterator().next(); + JobId jobId = job.getID(); + app.waitForState(job, JobState.KILLED); + + // make sure all events are flushed + app.waitForState(Service.STATE.STOPPED); + + JobHistory jobHistory = new JobHistory(); + jobHistory.init(conf); + + HistoryFileInfo fileInfo = jobHistory.getJobFileInfo(jobId); + + JobHistoryParser parser; + JobInfo jobInfo; + synchronized (fileInfo) { + Path historyFilePath = fileInfo.getHistoryFile(); + FSDataInputStream in = null; + FileContext fc = null; + try { + fc = FileContext.getFileContext(conf); + in = fc.open(fc.makeQualified(historyFilePath)); + } catch (IOException ioe) { + LOG.info("Can not open history file: " + historyFilePath, ioe); + throw (new Exception("Can not open History File")); + } + + parser = new JobHistoryParser(in); + jobInfo = parser.parse(); + } + Exception parseException = parser.getParseException(); + assertNull("Caught an expected exception " + parseException, + parseException); + final List originalDiagnostics = job.getDiagnostics(); + final String historyError = jobInfo.getErrorInfo(); + assertTrue("No original diagnostics for a failed job", + originalDiagnostics != null && !originalDiagnostics.isEmpty()); + assertNotNull("No history error info for a failed job ", historyError); + for (String diagString : originalDiagnostics) { + assertTrue(historyError.contains(diagString)); + } + assertTrue("No killed message in diagnostics", + historyError.contains(JobImpl.JOB_KILLED_DIAG)); + } finally { + LOG.info("FINISHED testDiagnosticsForKilledJob"); + } + } + @Test(timeout = 50000) public void testScanningOldDirs() throws Exception { LOG.info("STARTING testScanningOldDirs"); try { Configuration conf = new Configuration(); conf.setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(conf); MRApp app = new MRAppWithHistory(1, 1, true, this.getClass().getName(), @@ -590,6 +666,27 @@ protected void attemptLaunched(TaskAttemptId attemptID) { } } + static class MRAppWithHistoryWithJobKilled extends MRAppWithHistory { + + public MRAppWithHistoryWithJobKilled(int maps, int reduces, + boolean autoComplete, String testName, boolean cleanOnStart) { + super(maps, reduces, autoComplete, testName, cleanOnStart); + } + + @SuppressWarnings("unchecked") + @Override + protected void attemptLaunched(TaskAttemptId attemptID) { + if (attemptID.getTaskId().getId() == 0) { + getContext().getEventHandler().handle( + new JobEvent(attemptID.getTaskId().getJobId(), + JobEventType.JOB_KILL)); + } else { + getContext().getEventHandler().handle( + new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_DONE)); + } + } + } + static class HistoryFileManagerForTest extends HistoryFileManager { void deleteJobFromJobListCache(HistoryFileInfo fileInfo) { jobListCache.delete(fileInfo); @@ -613,7 +710,7 @@ public void testDeleteFileInfo() throws Exception { Configuration conf = new Configuration(); conf.setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(conf); @@ -668,7 +765,7 @@ public void testJobHistoryMethods() throws Exception { Configuration configuration = new Configuration(); configuration .setClass( - CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, + NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class); RackResolver.init(configuration); @@ -743,7 +840,7 @@ public void testMultipleFailedTasks() throws Exception { final org.apache.hadoop.mapreduce.TaskType taskType = org.apache.hadoop.mapreduce.TaskType.MAP; final TaskID[] tids = new TaskID[2]; - JobID jid = new JobID("1", 1); + final JobID jid = new JobID("1", 1); tids[0] = new TaskID(jid, taskType, 0); tids[1] = new TaskID(jid, taskType, 1); Mockito.when(reader.getNextEvent()).thenAnswer( @@ -762,6 +859,13 @@ public HistoryEvent answer(InvocationOnMock invocation) tfe.setDatum(tfe.getDatum()); return tfe; } + if (eventId < 5) { + JobUnsuccessfulCompletionEvent juce = + new JobUnsuccessfulCompletionEvent(jid, 100L, 2, 0, + "JOB_FAILED", Collections.singletonList( + "Task failed: " + tids[0].toString())); + return juce; + } return null; } }); @@ -769,4 +873,22 @@ public HistoryEvent answer(InvocationOnMock invocation) assertTrue("Task 0 not implicated", info.getErrorInfo().contains(tids[0].toString())); } + + @Test + public void testFailedJobHistoryWithoutDiagnostics() throws Exception { + final Path histPath = new Path(getClass().getClassLoader().getResource( + "job_1393307629410_0001-1393307687476-user-Sleep+job-1393307723835-0-0-FAILED-default-1393307693920.jhist") + .getFile()); + final FileSystem lfs = FileSystem.getLocal(new Configuration()); + final FSDataInputStream fsdis = lfs.open(histPath); + try { + JobHistoryParser parser = new JobHistoryParser(fsdis); + JobInfo info = parser.parse(); + assertEquals("History parsed jobId incorrectly", + info.getJobId(), JobID.forName("job_1393307629410_0001") ); + assertEquals("Default diagnostics incorrect ", "", info.getErrorInfo()); + } finally { + fsdis.close(); + } + } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/resources/job_1393307629410_0001-1393307687476-user-Sleep+job-1393307723835-0-0-FAILED-default-1393307693920.jhist b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/resources/job_1393307629410_0001-1393307687476-user-Sleep+job-1393307723835-0-0-FAILED-default-1393307693920.jhist new file mode 100644 index 0000000000..fafe451443 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/resources/job_1393307629410_0001-1393307687476-user-Sleep+job-1393307723835-0-0-FAILED-default-1393307693920.jhist @@ -0,0 +1,19 @@ +Avro-Json +{"type":"record","name":"Event","namespace":"org.apache.hadoop.mapreduce.jobhistory","fields":[{"name":"type","type":{"type":"enum","name":"EventType","symbols":["JOB_SUBMITTED","JOB_INITED","JOB_FINISHED","JOB_PRIORITY_CHANGED","JOB_STATUS_CHANGED","JOB_FAILED","JOB_KILLED","JOB_ERROR","JOB_INFO_CHANGED","TASK_STARTED","TASK_FINISHED","TASK_FAILED","TASK_UPDATED","NORMALIZED_RESOURCE","MAP_ATTEMPT_STARTED","MAP_ATTEMPT_FINISHED","MAP_ATTEMPT_FAILED","MAP_ATTEMPT_KILLED","REDUCE_ATTEMPT_STARTED","REDUCE_ATTEMPT_FINISHED","REDUCE_ATTEMPT_FAILED","REDUCE_ATTEMPT_KILLED","SETUP_ATTEMPT_STARTED","SETUP_ATTEMPT_FINISHED","SETUP_ATTEMPT_FAILED","SETUP_ATTEMPT_KILLED","CLEANUP_ATTEMPT_STARTED","CLEANUP_ATTEMPT_FINISHED","CLEANUP_ATTEMPT_FAILED","CLEANUP_ATTEMPT_KILLED","AM_STARTED"]}},{"name":"event","type":[{"type":"record","name":"JobFinished","fields":[{"name":"jobid","type":"string"},{"name":"finishTime","type":"long"},{"name":"finishedMaps","type":"int"},{"name":"finishedReduces","type":"int"},{"name":"failedMaps","type":"int"},{"name":"failedReduces","type":"int"},{"name":"totalCounters","type":{"type":"record","name":"JhCounters","fields":[{"name":"name","type":"string"},{"name":"groups","type":{"type":"array","items":{"type":"record","name":"JhCounterGroup","fields":[{"name":"name","type":"string"},{"name":"displayName","type":"string"},{"name":"counts","type":{"type":"array","items":{"type":"record","name":"JhCounter","fields":[{"name":"name","type":"string"},{"name":"displayName","type":"string"},{"name":"value","type":"long"}]}}}]}}}]}},{"name":"mapCounters","type":"JhCounters"},{"name":"reduceCounters","type":"JhCounters"}]},{"type":"record","name":"JobInfoChange","fields":[{"name":"jobid","type":"string"},{"name":"submitTime","type":"long"},{"name":"launchTime","type":"long"}]},{"type":"record","name":"JobInited","fields":[{"name":"jobid","type":"string"},{"name":"launchTime","type":"long"},{"name":"totalMaps","type":"int"},{"name":"totalReduces","type":"int"},{"name":"jobStatus","type":"string"},{"name":"uberized","type":"boolean"}]},{"type":"record","name":"AMStarted","fields":[{"name":"applicationAttemptId","type":"string"},{"name":"startTime","type":"long"},{"name":"containerId","type":"string"},{"name":"nodeManagerHost","type":"string"},{"name":"nodeManagerPort","type":"int"},{"name":"nodeManagerHttpPort","type":"int"}]},{"type":"record","name":"JobPriorityChange","fields":[{"name":"jobid","type":"string"},{"name":"priority","type":"string"}]},{"type":"record","name":"JobStatusChanged","fields":[{"name":"jobid","type":"string"},{"name":"jobStatus","type":"string"}]},{"type":"record","name":"JobSubmitted","fields":[{"name":"jobid","type":"string"},{"name":"jobName","type":"string"},{"name":"userName","type":"string"},{"name":"submitTime","type":"long"},{"name":"jobConfPath","type":"string"},{"name":"acls","type":{"type":"map","values":"string"}},{"name":"jobQueueName","type":"string"},{"name":"workflowId","type":"string"},{"name":"workflowName","type":"string"},{"name":"workflowNodeName","type":"string"},{"name":"workflowAdjacencies","type":"string"},{"name":"workflowTags","type":"string"}]},{"type":"record","name":"JobUnsuccessfulCompletion","fields":[{"name":"jobid","type":"string"},{"name":"finishTime","type":"long"},{"name":"finishedMaps","type":"int"},{"name":"finishedReduces","type":"int"},{"name":"jobStatus","type":"string"}]},{"type":"record","name":"MapAttemptFinished","fields":[{"name":"taskid","type":"string"},{"name":"attemptId","type":"string"},{"name":"taskType","type":"string"},{"name":"taskStatus","type":"string"},{"name":"mapFinishTime","type":"long"},{"name":"finishTime","type":"long"},{"name":"hostname","type":"string"},{"name":"port","type":"int"},{"name":"rackname","type":"string"},{"name":"state","type":"string"},{"name":"counters","type":"JhCounters"},{"name":"clockSplits","type":{"type":"array","items":"int"}},{"name":"cpuUsages","type":{"type":"array","items":"int"}},{"name":"vMemKbytes","type":{"type":"array","items":"int"}},{"name":"physMemKbytes","type":{"type":"array","items":"int"}}]},{"type":"record","name":"ReduceAttemptFinished","fields":[{"name":"taskid","type":"string"},{"name":"attemptId","type":"string"},{"name":"taskType","type":"string"},{"name":"taskStatus","type":"string"},{"name":"shuffleFinishTime","type":"long"},{"name":"sortFinishTime","type":"long"},{"name":"finishTime","type":"long"},{"name":"hostname","type":"string"},{"name":"port","type":"int"},{"name":"rackname","type":"string"},{"name":"state","type":"string"},{"name":"counters","type":"JhCounters"},{"name":"clockSplits","type":{"type":"array","items":"int"}},{"name":"cpuUsages","type":{"type":"array","items":"int"}},{"name":"vMemKbytes","type":{"type":"array","items":"int"}},{"name":"physMemKbytes","type":{"type":"array","items":"int"}}]},{"type":"record","name":"TaskAttemptFinished","fields":[{"name":"taskid","type":"string"},{"name":"attemptId","type":"string"},{"name":"taskType","type":"string"},{"name":"taskStatus","type":"string"},{"name":"finishTime","type":"long"},{"name":"rackname","type":"string"},{"name":"hostname","type":"string"},{"name":"state","type":"string"},{"name":"counters","type":"JhCounters"}]},{"type":"record","name":"TaskAttemptStarted","fields":[{"name":"taskid","type":"string"},{"name":"taskType","type":"string"},{"name":"attemptId","type":"string"},{"name":"startTime","type":"long"},{"name":"trackerName","type":"string"},{"name":"httpPort","type":"int"},{"name":"shufflePort","type":"int"},{"name":"containerId","type":"string"},{"name":"locality","type":"string"},{"name":"avataar","type":"string"}]},{"type":"record","name":"TaskAttemptUnsuccessfulCompletion","fields":[{"name":"taskid","type":"string"},{"name":"taskType","type":"string"},{"name":"attemptId","type":"string"},{"name":"finishTime","type":"long"},{"name":"hostname","type":"string"},{"name":"port","type":"int"},{"name":"rackname","type":"string"},{"name":"status","type":"string"},{"name":"error","type":"string"},{"name":"counters","type":"JhCounters"},{"name":"clockSplits","type":{"type":"array","items":"int"}},{"name":"cpuUsages","type":{"type":"array","items":"int"}},{"name":"vMemKbytes","type":{"type":"array","items":"int"}},{"name":"physMemKbytes","type":{"type":"array","items":"int"}}]},{"type":"record","name":"TaskFailed","fields":[{"name":"taskid","type":"string"},{"name":"taskType","type":"string"},{"name":"finishTime","type":"long"},{"name":"error","type":"string"},{"name":"failedDueToAttempt","type":["null","string"]},{"name":"status","type":"string"},{"name":"counters","type":"JhCounters"}]},{"type":"record","name":"TaskFinished","fields":[{"name":"taskid","type":"string"},{"name":"taskType","type":"string"},{"name":"finishTime","type":"long"},{"name":"status","type":"string"},{"name":"counters","type":"JhCounters"},{"name":"successfulAttemptId","type":"string"}]},{"type":"record","name":"TaskStarted","fields":[{"name":"taskid","type":"string"},{"name":"taskType","type":"string"},{"name":"startTime","type":"long"},{"name":"splitLocations","type":"string"}]},{"type":"record","name":"TaskUpdated","fields":[{"name":"taskid","type":"string"},{"name":"finishTime","type":"long"}]}]}]} +{"type":"AM_STARTED","event":{"org.apache.hadoop.mapreduce.jobhistory.AMStarted":{"applicationAttemptId":"appattempt_1393307629410_0001_000001","startTime":1393307691014,"containerId":"container_1393307629410_0001_01_000001","nodeManagerHost":"172.25.142.198","nodeManagerPort":57763,"nodeManagerHttpPort":8042}}} + +{"type":"JOB_SUBMITTED","event":{"org.apache.hadoop.mapreduce.jobhistory.JobSubmitted":{"jobid":"job_1393307629410_0001","jobName":"Sleep job","userName":"user","submitTime":1393307687476,"jobConfPath":"hdfs://localhost:9000/tmp/hadoop-yarn/staging/user/.staging/job_1393307629410_0001/job.xml","acls":{},"jobQueueName":"default","workflowId":"","workflowName":"","workflowNodeName":"","workflowAdjacencies":"","workflowTags":""}}} + +{"type":"JOB_INITED","event":{"org.apache.hadoop.mapreduce.jobhistory.JobInited":{"jobid":"job_1393307629410_0001","launchTime":1393307693920,"totalMaps":1,"totalReduces":0,"jobStatus":"INITED","uberized":false}}} + +{"type":"JOB_INFO_CHANGED","event":{"org.apache.hadoop.mapreduce.jobhistory.JobInfoChange":{"jobid":"job_1393307629410_0001","submitTime":1393307687476,"launchTime":1393307693920}}} + +{"type":"TASK_STARTED","event":{"org.apache.hadoop.mapreduce.jobhistory.TaskStarted":{"taskid":"task_1393307629410_0001_m_000000","taskType":"MAP","startTime":1393307693956,"splitLocations":""}}} + +{"type":"MAP_ATTEMPT_STARTED","event":{"org.apache.hadoop.mapreduce.jobhistory.TaskAttemptStarted":{"taskid":"task_1393307629410_0001_m_000000","taskType":"MAP","attemptId":"attempt_1393307629410_0001_m_000000_0","startTime":1393307696163,"trackerName":"172.25.142.198","httpPort":8042,"shufflePort":13562,"containerId":"container_1393307629410_0001_01_000002","locality":"OFF_SWITCH","avataar":"VIRGIN"}}} + +{"type":"MAP_ATTEMPT_FAILED","event":{"org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletion":{"taskid":"task_1393307629410_0001_m_000000","taskType":"MAP","attemptId":"attempt_1393307629410_0001_m_000000_0","finishTime":1393307723803,"hostname":"172.25.142.198","port":57763,"rackname":"/default-rack","status":"FAILED","error":"AttemptID:attempt_1393307629410_0001_m_000000_0 Timed out after 5 secs","counters":{"name":"COUNTERS","groups":[{"name":"org.apache.hadoop.mapreduce.FileSystemCounter","displayName":"File System Counters","counts":[{"name":"FILE_BYTES_READ","displayName":"FILE: Number of bytes read","value":0},{"name":"FILE_BYTES_WRITTEN","displayName":"FILE: Number of bytes written","value":86662},{"name":"FILE_READ_OPS","displayName":"FILE: Number of read operations","value":0},{"name":"FILE_LARGE_READ_OPS","displayName":"FILE: Number of large read operations","value":0},{"name":"FILE_WRITE_OPS","displayName":"FILE: Number of write operations","value":0},{"name":"HDFS_BYTES_READ","displayName":"HDFS: Number of bytes read","value":48},{"name":"HDFS_BYTES_WRITTEN","displayName":"HDFS: Number of bytes written","value":0},{"name":"HDFS_READ_OPS","displayName":"HDFS: Number of read operations","value":1},{"name":"HDFS_LARGE_READ_OPS","displayName":"HDFS: Number of large read operations","value":0},{"name":"HDFS_WRITE_OPS","displayName":"HDFS: Number of write operations","value":0}]},{"name":"org.apache.hadoop.mapreduce.TaskCounter","displayName":"Map-Reduce Framework","counts":[{"name":"MAP_INPUT_RECORDS","displayName":"Map input records","value":1},{"name":"MAP_OUTPUT_RECORDS","displayName":"Map output records","value":0},{"name":"SPLIT_RAW_BYTES","displayName":"Input split bytes","value":48},{"name":"SPILLED_RECORDS","displayName":"Spilled Records","value":0},{"name":"FAILED_SHUFFLE","displayName":"Failed Shuffles","value":0},{"name":"MERGED_MAP_OUTPUTS","displayName":"Merged Map outputs","value":0},{"name":"GC_TIME_MILLIS","displayName":"GC time elapsed (ms)","value":13},{"name":"CPU_MILLISECONDS","displayName":"CPU time spent (ms)","value":0},{"name":"PHYSICAL_MEMORY_BYTES","displayName":"Physical memory (bytes) snapshot","value":0},{"name":"VIRTUAL_MEMORY_BYTES","displayName":"Virtual memory (bytes) snapshot","value":0},{"name":"COMMITTED_HEAP_BYTES","displayName":"Total committed heap usage (bytes)","value":85000192}]},{"name":"org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter","displayName":"File Input Format Counters ","counts":[{"name":"BYTES_READ","displayName":"Bytes Read","value":0}]},{"name":"org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter","displayName":"File Output Format Counters ","counts":[{"name":"BYTES_WRITTEN","displayName":"Bytes Written","value":0}]}]},"clockSplits":[747,747,748,747,747,748,747,747,748,747,747,748],"cpuUsages":[0,0,0,0,0,0,0,0,0,0,0,0],"vMemKbytes":[0,0,0,0,0,0,0,0,0,0,0,0],"physMemKbytes":[0,0,0,0,0,0,0,0,0,0,0,0]}}} + +{"type":"TASK_FAILED","event":{"org.apache.hadoop.mapreduce.jobhistory.TaskFailed":{"taskid":"task_1393307629410_0001_m_000000","taskType":"MAP","finishTime":1393307723803,"error":", AttemptID:attempt_1393307629410_0001_m_000000_0 Timed out after 5 secs","failedDueToAttempt":{"string":"attempt_1393307629410_0001_m_000000_0"},"status":"FAILED","counters":{"name":"COUNTERS","groups":[]}}}} + +{"type":"JOB_FAILED","event":{"org.apache.hadoop.mapreduce.jobhistory.JobUnsuccessfulCompletion":{"jobid":"job_1393307629410_0001","finishTime":1393307723835,"finishedMaps":0,"finishedReduces":0,"jobStatus":"FAILED"}}}