YARN-755. Renamed AllocateResponse.reboot to AllocateResponse.resync. Contributed by Bikas Saha.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1489295 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
28aabe0b2b
commit
978012b9b6
@ -116,7 +116,7 @@ protected synchronized void heartbeat() throws Exception {
|
||||
// continue to attempt to contact the RM.
|
||||
throw e;
|
||||
}
|
||||
if (allocateResponse.getReboot()) {
|
||||
if (allocateResponse.getResync()) {
|
||||
LOG.info("Event from RM: shutting down Application Master");
|
||||
// This can happen if the RM has been restarted. If it is in that state,
|
||||
// this application must clean itself up.
|
||||
|
@ -570,7 +570,7 @@ private List<Container> getResources() throws Exception {
|
||||
// continue to attempt to contact the RM.
|
||||
throw e;
|
||||
}
|
||||
if (response.getReboot()) {
|
||||
if (response.getResync()) {
|
||||
// This can happen if the RM has been restarted. If it is in that state,
|
||||
// this application must clean itself up.
|
||||
eventHandler.handle(new JobEvent(this.getJob().getID(),
|
||||
|
@ -93,8 +93,8 @@ Release 2.1.0-beta - UNRELEASED
|
||||
YARN-635. Renamed YarnRemoteException to YarnException. (Siddharth Seth via
|
||||
vinodkv)
|
||||
|
||||
YARN-756. Move Preemption* records to yarn.api where they really belong.
|
||||
(Jian He via vinodkv)
|
||||
YARN-755. Renamed AllocateResponse.reboot to AllocateResponse.resync. (Bikas
|
||||
Saha via vinodkv)
|
||||
|
||||
NEW FEATURES
|
||||
|
||||
@ -265,6 +265,9 @@ Release 2.1.0-beta - UNRELEASED
|
||||
YARN-717. Put object creation factories for Token in the class itself and
|
||||
remove useless derivations for specific tokens. (Jian He via vinodkv)
|
||||
|
||||
YARN-756. Move Preemption* records to yarn.api where they really belong.
|
||||
(Jian He via vinodkv)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
YARN-512. Log aggregation root directory check is more expensive than it
|
||||
|
@ -65,7 +65,7 @@ public abstract class AllocateResponse {
|
||||
public static AllocateResponse newInstance(int responseId,
|
||||
List<ContainerStatus> completedContainers,
|
||||
List<Container> allocatedContainers, List<NodeReport> updatedNodes,
|
||||
Resource availResources, boolean reboot, int numClusterNodes,
|
||||
Resource availResources, boolean resync, int numClusterNodes,
|
||||
PreemptionMessage preempt) {
|
||||
AllocateResponse response = Records.newRecord(AllocateResponse.class);
|
||||
response.setNumClusterNodes(numClusterNodes);
|
||||
@ -74,26 +74,32 @@ public static AllocateResponse newInstance(int responseId,
|
||||
response.setAllocatedContainers(allocatedContainers);
|
||||
response.setUpdatedNodes(updatedNodes);
|
||||
response.setAvailableResources(availResources);
|
||||
response.setReboot(reboot);
|
||||
response.setResync(resync);
|
||||
response.setPreemptionMessage(preempt);
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should the <code>ApplicationMaster</code> reboot for being horribly
|
||||
* Should the <code>ApplicationMaster</code> take action because of being
|
||||
* out-of-sync with the <code>ResourceManager</code> as determined by
|
||||
* {@link #getResponseId()}?
|
||||
* {@link #getResponseId()}.
|
||||
* This can be due to application errors or because the ResourceManager
|
||||
* has restarted. The action to be taken by the <code>ApplicationMaster</code>
|
||||
* is to shutdown without unregistering with the <code>ResourceManager</code>.
|
||||
* The ResourceManager will start a new attempt. If the application is already
|
||||
* done when it gets the resync command, then it may choose to shutdown after
|
||||
* unregistering in which case the ResourceManager will not start a new attempt.
|
||||
*
|
||||
* @return <code>true</code> if the <code>ApplicationMaster</code> should
|
||||
* reboot, <code>false</code> otherwise
|
||||
* take action, <code>false</code> otherwise
|
||||
*/
|
||||
@Public
|
||||
@Stable
|
||||
public abstract boolean getReboot();
|
||||
public abstract boolean getResync();
|
||||
|
||||
@Private
|
||||
@Unstable
|
||||
public abstract void setReboot(boolean reboot);
|
||||
public abstract void setResync(boolean value);
|
||||
|
||||
/**
|
||||
* Get the <em>last response id</em>.
|
||||
|
@ -145,15 +145,15 @@ private synchronized void maybeInitBuilder() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized boolean getReboot() {
|
||||
public synchronized boolean getResync() {
|
||||
AllocateResponseProtoOrBuilder p = viaProto ? proto : builder;
|
||||
return (p.getReboot());
|
||||
return (p.getResync());
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void setReboot(boolean reboot) {
|
||||
public synchronized void setResync(boolean resync) {
|
||||
maybeInitBuilder();
|
||||
builder.setReboot((reboot));
|
||||
builder.setResync((resync));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -59,7 +59,7 @@ message AllocateRequestProto {
|
||||
}
|
||||
|
||||
message AllocateResponseProto {
|
||||
optional bool reboot = 1;
|
||||
optional bool resync = 1;
|
||||
optional int32 response_id = 2;
|
||||
repeated ContainerProto allocated_containers = 3;
|
||||
repeated ContainerStatusProto completed_container_statuses = 4;
|
||||
|
@ -331,7 +331,7 @@ public void run() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (response.getReboot()) {
|
||||
if (response.getResync()) {
|
||||
handler.onRebootRequest();
|
||||
LOG.info("Reboot requested. Stopping callback.");
|
||||
break;
|
||||
|
@ -189,7 +189,7 @@ public void testAMRMClientAsyncReboot() throws Exception {
|
||||
|
||||
final AllocateResponse rebootResponse = createAllocateResponse(
|
||||
new ArrayList<ContainerStatus>(), new ArrayList<Container>());
|
||||
rebootResponse.setReboot(true);
|
||||
rebootResponse.setResync(true);
|
||||
when(client.allocate(anyFloat())).thenReturn(rebootResponse);
|
||||
|
||||
AMRMClientAsync<ContainerRequest> asyncClient =
|
||||
|
@ -411,7 +411,7 @@ public static AllocateResponse newAllocateResponse(int responseId,
|
||||
response.setAllocatedContainers(allocatedContainers);
|
||||
response.setUpdatedNodes(updatedNodes);
|
||||
response.setAvailableResources(availResources);
|
||||
response.setReboot(reboot);
|
||||
response.setResync(reboot);
|
||||
response.setPreemptionMessage(preempt);
|
||||
|
||||
return response;
|
||||
|
@ -90,7 +90,7 @@ public class ApplicationMasterService extends AbstractService implements
|
||||
RecordFactoryProvider.getRecordFactory(null);
|
||||
private final ConcurrentMap<ApplicationAttemptId, AllocateResponse> responseMap =
|
||||
new ConcurrentHashMap<ApplicationAttemptId, AllocateResponse>();
|
||||
private final AllocateResponse reboot =
|
||||
private final AllocateResponse resync =
|
||||
recordFactory.newRecordInstance(AllocateResponse.class);
|
||||
private final RMContext rmContext;
|
||||
|
||||
@ -98,7 +98,7 @@ public ApplicationMasterService(RMContext rmContext, YarnScheduler scheduler) {
|
||||
super(ApplicationMasterService.class.getName());
|
||||
this.amLivelinessMonitor = rmContext.getAMLivelinessMonitor();
|
||||
this.rScheduler = scheduler;
|
||||
this.reboot.setReboot(true);
|
||||
this.resync.setResync(true);
|
||||
// this.reboot.containers = new ArrayList<Container>();
|
||||
this.rmContext = rmContext;
|
||||
}
|
||||
@ -263,7 +263,7 @@ public AllocateResponse allocate(AllocateRequest request)
|
||||
AllocateResponse lastResponse = responseMap.get(appAttemptId);
|
||||
if (lastResponse == null) {
|
||||
LOG.error("AppAttemptId doesnt exist in cache " + appAttemptId);
|
||||
return reboot;
|
||||
return resync;
|
||||
}
|
||||
if ((request.getResponseId() + 1) == lastResponse.getResponseId()) {
|
||||
/* old heartbeat */
|
||||
@ -273,7 +273,7 @@ public AllocateResponse allocate(AllocateRequest request)
|
||||
// Sending resync isn't enough here: the RM state is corrupted. TODO:
|
||||
// Resync is not useful since after the AM restarts, it will send register and
|
||||
// get an exception. Might as well throw an exception here.
|
||||
return reboot;
|
||||
return resync;
|
||||
}
|
||||
|
||||
// Allow only one thread in AM to do heartbeat at a time.
|
||||
@ -344,7 +344,7 @@ public AllocateResponse allocate(AllocateRequest request)
|
||||
String message = "App Attempt removed from the cache during allocate"
|
||||
+ appAttemptId;
|
||||
LOG.error(message);
|
||||
return reboot;
|
||||
return resync;
|
||||
}
|
||||
|
||||
allocateResponse.setNumClusterNodes(this.rScheduler.getNumClusterNodes());
|
||||
|
@ -250,7 +250,7 @@ public void testRMRestart() throws Exception {
|
||||
AllocateResponse allocResponse = am1.allocate(
|
||||
new ArrayList<ResourceRequest>(),
|
||||
new ArrayList<ContainerId>());
|
||||
Assert.assertTrue(allocResponse.getReboot());
|
||||
Assert.assertTrue(allocResponse.getResync());
|
||||
|
||||
// NM should be rebooted on heartbeat, even first heartbeat for nm2
|
||||
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
|
||||
|
@ -82,7 +82,7 @@ public void testARRMResponseId() throws Exception {
|
||||
|
||||
AllocateResponse response = amService.allocate(allocateRequest);
|
||||
Assert.assertEquals(1, response.getResponseId());
|
||||
Assert.assertFalse(response.getReboot());
|
||||
Assert.assertFalse(response.getResync());
|
||||
allocateRequest = AllocateRequest.newInstance(attempt
|
||||
.getAppAttemptId(), response.getResponseId(), 0F, null, null);
|
||||
|
||||
@ -96,6 +96,6 @@ public void testARRMResponseId() throws Exception {
|
||||
allocateRequest = AllocateRequest.newInstance(attempt
|
||||
.getAppAttemptId(), 0, 0F, null, null);
|
||||
response = amService.allocate(allocateRequest);
|
||||
Assert.assertTrue(response.getReboot());
|
||||
Assert.assertTrue(response.getResync());
|
||||
}
|
||||
}
|
||||
|
@ -208,7 +208,7 @@ public void testMasterKeyRollOver() throws Exception {
|
||||
AllocateRequest allocateRequest =
|
||||
Records.newRecord(AllocateRequest.class);
|
||||
allocateRequest.setApplicationAttemptId(applicationAttemptId);
|
||||
Assert.assertFalse(rmClient.allocate(allocateRequest).getReboot());
|
||||
Assert.assertFalse(rmClient.allocate(allocateRequest).getResync());
|
||||
|
||||
// Simulate a master-key-roll-over
|
||||
ApplicationTokenSecretManager appTokenSecretManager =
|
||||
@ -224,7 +224,7 @@ public void testMasterKeyRollOver() throws Exception {
|
||||
rmClient = createRMClient(rm, conf, rpc, currentUser);
|
||||
allocateRequest = Records.newRecord(AllocateRequest.class);
|
||||
allocateRequest.setApplicationAttemptId(applicationAttemptId);
|
||||
Assert.assertFalse(rmClient.allocate(allocateRequest).getReboot());
|
||||
Assert.assertFalse(rmClient.allocate(allocateRequest).getResync());
|
||||
} finally {
|
||||
rm.stop();
|
||||
if (rmClient != null) {
|
||||
|
Loading…
Reference in New Issue
Block a user