MAPREDUCE-4611. MR AM dies badly when Node is decommissioned (Robert Evans via tgraves)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1379599 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
63f941d2ad
commit
25e96e455b
@ -858,6 +858,9 @@ Release 0.23.3 - UNRELEASED
|
|||||||
|
|
||||||
MAPREDUCE-4614. Simplify debugging a job's tokens (daryn via bobby)
|
MAPREDUCE-4614. Simplify debugging a job's tokens (daryn via bobby)
|
||||||
|
|
||||||
|
MAPREDUCE-4611. MR AM dies badly when Node is decommissioned (Robert
|
||||||
|
Evans via tgraves)
|
||||||
|
|
||||||
Release 0.23.2 - UNRELEASED
|
Release 0.23.2 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
@ -99,8 +99,8 @@ public class JobHistoryEventHandler extends AbstractService
|
|||||||
protected static final Map<JobId, MetaInfo> fileMap =
|
protected static final Map<JobId, MetaInfo> fileMap =
|
||||||
Collections.<JobId,MetaInfo>synchronizedMap(new HashMap<JobId,MetaInfo>());
|
Collections.<JobId,MetaInfo>synchronizedMap(new HashMap<JobId,MetaInfo>());
|
||||||
|
|
||||||
// Has a signal (SIGTERM etc) been issued?
|
// should job completion be force when the AM shuts down?
|
||||||
protected volatile boolean isSignalled = false;
|
protected volatile boolean forceJobCompletion = false;
|
||||||
|
|
||||||
public JobHistoryEventHandler(AppContext context, int startCount) {
|
public JobHistoryEventHandler(AppContext context, int startCount) {
|
||||||
super("JobHistoryEventHandler");
|
super("JobHistoryEventHandler");
|
||||||
@ -322,7 +322,7 @@ public void stop() {
|
|||||||
// Process JobUnsuccessfulCompletionEvent for jobIds which still haven't
|
// Process JobUnsuccessfulCompletionEvent for jobIds which still haven't
|
||||||
// closed their event writers
|
// closed their event writers
|
||||||
Iterator<JobId> jobIt = fileMap.keySet().iterator();
|
Iterator<JobId> jobIt = fileMap.keySet().iterator();
|
||||||
if(isSignalled) {
|
if(forceJobCompletion) {
|
||||||
while (jobIt.hasNext()) {
|
while (jobIt.hasNext()) {
|
||||||
JobId toClose = jobIt.next();
|
JobId toClose = jobIt.next();
|
||||||
MetaInfo mi = fileMap.get(toClose);
|
MetaInfo mi = fileMap.get(toClose);
|
||||||
@ -911,9 +911,9 @@ private String getFileNameFromTmpFN(String tmpFileName) {
|
|||||||
return tmpFileName.substring(0, tmpFileName.length()-4);
|
return tmpFileName.substring(0, tmpFileName.length()-4);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setSignalled(boolean isSignalled) {
|
public void setForcejobCompletion(boolean forceJobCompletion) {
|
||||||
this.isSignalled = isSignalled;
|
this.forceJobCompletion = forceJobCompletion;
|
||||||
LOG.info("JobHistoryEventHandler notified that isSignalled was "
|
LOG.info("JobHistoryEventHandler notified that forceJobCompletion is "
|
||||||
+ isSignalled);
|
+ forceJobCompletion);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -170,6 +170,8 @@ public class MRAppMaster extends CompositeService {
|
|||||||
private Credentials fsTokens = new Credentials(); // Filled during init
|
private Credentials fsTokens = new Credentials(); // Filled during init
|
||||||
private UserGroupInformation currentUser; // Will be setup during init
|
private UserGroupInformation currentUser; // Will be setup during init
|
||||||
|
|
||||||
|
private volatile boolean isLastAMRetry = false;
|
||||||
|
|
||||||
public MRAppMaster(ApplicationAttemptId applicationAttemptId,
|
public MRAppMaster(ApplicationAttemptId applicationAttemptId,
|
||||||
ContainerId containerId, String nmHost, int nmPort, int nmHttpPort,
|
ContainerId containerId, String nmHost, int nmPort, int nmHttpPort,
|
||||||
long appSubmitTime) {
|
long appSubmitTime) {
|
||||||
@ -195,11 +197,21 @@ public MRAppMaster(ApplicationAttemptId applicationAttemptId,
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void init(final Configuration conf) {
|
public void init(final Configuration conf) {
|
||||||
|
|
||||||
conf.setBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY, true);
|
conf.setBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY, true);
|
||||||
|
|
||||||
downloadTokensAndSetupUGI(conf);
|
downloadTokensAndSetupUGI(conf);
|
||||||
|
|
||||||
|
//TODO this is a hack, we really need the RM to inform us when we
|
||||||
|
// are the last one. This would allow us to configure retries on
|
||||||
|
// a per application basis.
|
||||||
|
int numAMRetries = conf.getInt(YarnConfiguration.RM_AM_MAX_RETRIES,
|
||||||
|
YarnConfiguration.DEFAULT_RM_AM_MAX_RETRIES);
|
||||||
|
isLastAMRetry = appAttemptID.getAttemptId() >= numAMRetries;
|
||||||
|
LOG.info("AM Retries: " + numAMRetries +
|
||||||
|
" attempt num: " + appAttemptID.getAttemptId() +
|
||||||
|
" is last retry: " + isLastAMRetry);
|
||||||
|
|
||||||
|
|
||||||
context = new RunningAppContext(conf);
|
context = new RunningAppContext(conf);
|
||||||
|
|
||||||
// Job name is the same as the app name util we support DAG of jobs
|
// Job name is the same as the app name util we support DAG of jobs
|
||||||
@ -417,6 +429,8 @@ public void handle(JobFinishEvent event) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
//We are finishing cleanly so this is the last retry
|
||||||
|
isLastAMRetry = true;
|
||||||
// Stop all services
|
// Stop all services
|
||||||
// This will also send the final report to the ResourceManager
|
// This will also send the final report to the ResourceManager
|
||||||
LOG.info("Calling stop for all the services");
|
LOG.info("Calling stop for all the services");
|
||||||
@ -666,7 +680,11 @@ public void handle(ContainerAllocatorEvent event) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void setSignalled(boolean isSignalled) {
|
public void setSignalled(boolean isSignalled) {
|
||||||
((RMCommunicator) containerAllocator).setSignalled(true);
|
((RMCommunicator) containerAllocator).setSignalled(isSignalled);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setShouldUnregister(boolean shouldUnregister) {
|
||||||
|
((RMCommunicator) containerAllocator).setShouldUnregister(shouldUnregister);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -717,7 +735,12 @@ private final class StagingDirCleaningService extends AbstractService {
|
|||||||
@Override
|
@Override
|
||||||
public synchronized void stop() {
|
public synchronized void stop() {
|
||||||
try {
|
try {
|
||||||
cleanupStagingDir();
|
if(isLastAMRetry) {
|
||||||
|
cleanupStagingDir();
|
||||||
|
} else {
|
||||||
|
LOG.info("Skipping cleaning up the staging dir. "
|
||||||
|
+ "assuming AM will be retried.");
|
||||||
|
}
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
LOG.error("Failed to cleanup staging dir: ", io);
|
LOG.error("Failed to cleanup staging dir: ", io);
|
||||||
}
|
}
|
||||||
@ -1016,14 +1039,19 @@ static class MRAppMasterShutdownHook implements Runnable {
|
|||||||
public void run() {
|
public void run() {
|
||||||
LOG.info("MRAppMaster received a signal. Signaling RMCommunicator and "
|
LOG.info("MRAppMaster received a signal. Signaling RMCommunicator and "
|
||||||
+ "JobHistoryEventHandler.");
|
+ "JobHistoryEventHandler.");
|
||||||
|
|
||||||
// Notify the JHEH and RMCommunicator that a SIGTERM has been received so
|
// Notify the JHEH and RMCommunicator that a SIGTERM has been received so
|
||||||
// that they don't take too long in shutting down
|
// that they don't take too long in shutting down
|
||||||
if(appMaster.containerAllocator instanceof ContainerAllocatorRouter) {
|
if(appMaster.containerAllocator instanceof ContainerAllocatorRouter) {
|
||||||
((ContainerAllocatorRouter) appMaster.containerAllocator)
|
((ContainerAllocatorRouter) appMaster.containerAllocator)
|
||||||
.setSignalled(true);
|
.setSignalled(true);
|
||||||
|
((ContainerAllocatorRouter) appMaster.containerAllocator)
|
||||||
|
.setShouldUnregister(appMaster.isLastAMRetry);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(appMaster.jobHistoryEventHandler != null) {
|
if(appMaster.jobHistoryEventHandler != null) {
|
||||||
appMaster.jobHistoryEventHandler.setSignalled(true);
|
appMaster.jobHistoryEventHandler
|
||||||
|
.setForcejobCompletion(appMaster.isLastAMRetry);
|
||||||
}
|
}
|
||||||
appMaster.stop();
|
appMaster.stop();
|
||||||
}
|
}
|
||||||
|
@ -84,6 +84,7 @@ public abstract class RMCommunicator extends AbstractService {
|
|||||||
private Job job;
|
private Job job;
|
||||||
// Has a signal (SIGTERM etc) been issued?
|
// Has a signal (SIGTERM etc) been issued?
|
||||||
protected volatile boolean isSignalled = false;
|
protected volatile boolean isSignalled = false;
|
||||||
|
private volatile boolean shouldUnregister = true;
|
||||||
|
|
||||||
public RMCommunicator(ClientService clientService, AppContext context) {
|
public RMCommunicator(ClientService clientService, AppContext context) {
|
||||||
super("RMCommunicator");
|
super("RMCommunicator");
|
||||||
@ -213,7 +214,9 @@ public void stop() {
|
|||||||
} catch (InterruptedException ie) {
|
} catch (InterruptedException ie) {
|
||||||
LOG.warn("InterruptedException while stopping", ie);
|
LOG.warn("InterruptedException while stopping", ie);
|
||||||
}
|
}
|
||||||
unregister();
|
if(shouldUnregister) {
|
||||||
|
unregister();
|
||||||
|
}
|
||||||
super.stop();
|
super.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -288,8 +291,15 @@ public AMRMProtocol run() {
|
|||||||
|
|
||||||
protected abstract void heartbeat() throws Exception;
|
protected abstract void heartbeat() throws Exception;
|
||||||
|
|
||||||
|
public void setShouldUnregister(boolean shouldUnregister) {
|
||||||
|
this.shouldUnregister = shouldUnregister;
|
||||||
|
LOG.info("RMCommunicator notified that shouldUnregistered is: "
|
||||||
|
+ shouldUnregister);
|
||||||
|
}
|
||||||
|
|
||||||
public void setSignalled(boolean isSignalled) {
|
public void setSignalled(boolean isSignalled) {
|
||||||
this.isSignalled = isSignalled;
|
this.isSignalled = isSignalled;
|
||||||
LOG.info("RMCommunicator notified that iSignalled was : " + isSignalled);
|
LOG.info("RMCommunicator notified that iSignalled is: "
|
||||||
|
+ isSignalled);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -330,7 +330,7 @@ public void testSigTermedFunctionality() throws IOException {
|
|||||||
Mockito.when(jobId.getAppId()).thenReturn(mockAppId);
|
Mockito.when(jobId.getAppId()).thenReturn(mockAppId);
|
||||||
|
|
||||||
jheh.addToFileMap(jobId);
|
jheh.addToFileMap(jobId);
|
||||||
jheh.setSignalled(true);
|
jheh.setForcejobCompletion(true);
|
||||||
for(int i=0; i < numEvents; ++i) {
|
for(int i=0; i < numEvents; ++i) {
|
||||||
events[i] = getEventToEnqueue(jobId);
|
events[i] = getEventToEnqueue(jobId);
|
||||||
jheh.handle(events[i]);
|
jheh.handle(events[i]);
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
@ -47,6 +48,7 @@
|
|||||||
import org.apache.hadoop.yarn.YarnException;
|
import org.apache.hadoop.yarn.YarnException;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||||
@ -89,28 +91,94 @@ public void testDeletionofStaging() throws IOException {
|
|||||||
handler.handle(new JobFinishEvent(jobid));
|
handler.handle(new JobFinishEvent(jobid));
|
||||||
verify(fs).delete(stagingJobPath, true);
|
verify(fs).delete(stagingJobPath, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeletionofStagingOnKill() throws IOException {
|
||||||
|
conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, stagingJobDir);
|
||||||
|
conf.setInt(YarnConfiguration.RM_AM_MAX_RETRIES, 4);
|
||||||
|
fs = mock(FileSystem.class);
|
||||||
|
when(fs.delete(any(Path.class), anyBoolean())).thenReturn(true);
|
||||||
|
ApplicationAttemptId attemptId = recordFactory.newRecordInstance(
|
||||||
|
ApplicationAttemptId.class);
|
||||||
|
attemptId.setAttemptId(0);
|
||||||
|
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
|
||||||
|
appId.setClusterTimestamp(System.currentTimeMillis());
|
||||||
|
appId.setId(0);
|
||||||
|
attemptId.setApplicationId(appId);
|
||||||
|
JobId jobid = recordFactory.newRecordInstance(JobId.class);
|
||||||
|
jobid.setAppId(appId);
|
||||||
|
ContainerAllocator mockAlloc = mock(ContainerAllocator.class);
|
||||||
|
MRAppMaster appMaster = new TestMRApp(attemptId, mockAlloc);
|
||||||
|
appMaster.init(conf);
|
||||||
|
//simulate the process being killed
|
||||||
|
MRAppMaster.MRAppMasterShutdownHook hook =
|
||||||
|
new MRAppMaster.MRAppMasterShutdownHook(appMaster);
|
||||||
|
hook.run();
|
||||||
|
verify(fs, times(0)).delete(stagingJobPath, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeletionofStagingOnKillLastTry() throws IOException {
|
||||||
|
conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, stagingJobDir);
|
||||||
|
conf.setInt(YarnConfiguration.RM_AM_MAX_RETRIES, 1);
|
||||||
|
fs = mock(FileSystem.class);
|
||||||
|
when(fs.delete(any(Path.class), anyBoolean())).thenReturn(true);
|
||||||
|
ApplicationAttemptId attemptId = recordFactory.newRecordInstance(
|
||||||
|
ApplicationAttemptId.class);
|
||||||
|
attemptId.setAttemptId(1);
|
||||||
|
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
|
||||||
|
appId.setClusterTimestamp(System.currentTimeMillis());
|
||||||
|
appId.setId(0);
|
||||||
|
attemptId.setApplicationId(appId);
|
||||||
|
JobId jobid = recordFactory.newRecordInstance(JobId.class);
|
||||||
|
jobid.setAppId(appId);
|
||||||
|
ContainerAllocator mockAlloc = mock(ContainerAllocator.class);
|
||||||
|
MRAppMaster appMaster = new TestMRApp(attemptId, mockAlloc);
|
||||||
|
appMaster.init(conf);
|
||||||
|
//simulate the process being killed
|
||||||
|
MRAppMaster.MRAppMasterShutdownHook hook =
|
||||||
|
new MRAppMaster.MRAppMasterShutdownHook(appMaster);
|
||||||
|
hook.run();
|
||||||
|
verify(fs).delete(stagingJobPath, true);
|
||||||
|
}
|
||||||
|
|
||||||
private class TestMRApp extends MRAppMaster {
|
private class TestMRApp extends MRAppMaster {
|
||||||
|
ContainerAllocator allocator;
|
||||||
|
|
||||||
public TestMRApp(ApplicationAttemptId applicationAttemptId) {
|
public TestMRApp(ApplicationAttemptId applicationAttemptId,
|
||||||
super(applicationAttemptId, BuilderUtils.newContainerId(
|
ContainerAllocator allocator) {
|
||||||
applicationAttemptId, 1), "testhost", 2222, 3333, System
|
super(applicationAttemptId, BuilderUtils.newContainerId(
|
||||||
.currentTimeMillis());
|
applicationAttemptId, 1), "testhost", 2222, 3333, System
|
||||||
}
|
.currentTimeMillis());
|
||||||
|
this.allocator = allocator;
|
||||||
@Override
|
}
|
||||||
protected FileSystem getFileSystem(Configuration conf) {
|
|
||||||
return fs;
|
public TestMRApp(ApplicationAttemptId applicationAttemptId) {
|
||||||
}
|
this(applicationAttemptId, null);
|
||||||
|
}
|
||||||
@Override
|
|
||||||
protected void sysexit() {
|
@Override
|
||||||
}
|
protected FileSystem getFileSystem(Configuration conf) {
|
||||||
|
return fs;
|
||||||
@Override
|
}
|
||||||
public Configuration getConfig() {
|
|
||||||
return conf;
|
@Override
|
||||||
}
|
protected ContainerAllocator createContainerAllocator(
|
||||||
|
final ClientService clientService, final AppContext context) {
|
||||||
|
if(allocator == null) {
|
||||||
|
return super.createContainerAllocator(clientService, context);
|
||||||
|
}
|
||||||
|
return allocator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void sysexit() {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Configuration getConfig() {
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final class MRAppTestCleanup extends MRApp {
|
private final class MRAppTestCleanup extends MRApp {
|
||||||
|
Loading…
Reference in New Issue
Block a user