YARN-9084. Reset container state and defer readiness check for upgrade.

Contributed by Chandni Singh
This commit is contained in:
Eric Yang 2018-12-18 18:02:03 -05:00
parent c7a5a4435e
commit ccdd982e51
3 changed files with 41 additions and 23 deletions

View File

@ -68,12 +68,6 @@ public List<Component> findTargetComponentSpecs(Service currentDef,
"not supported by upgrade");
}
if (!Objects.equals(currentDef.getQuicklinks(),
targetDef.getQuicklinks())) {
throw new UnsupportedOperationException("changes to quick links " +
"not supported by upgrade");
}
if (!Objects.equals(currentDef.getLaunchTime(),
targetDef.getLaunchTime())) {
throw new UnsupportedOperationException("changes to launch time " +

View File

@ -47,6 +47,7 @@
import org.apache.hadoop.yarn.service.component.ComponentEvent;
import org.apache.hadoop.yarn.service.component.ComponentEventType;
import org.apache.hadoop.yarn.service.component.ComponentRestartPolicy;
import org.apache.hadoop.yarn.service.monitor.probe.DefaultProbe;
import org.apache.hadoop.yarn.service.monitor.probe.ProbeStatus;
import org.apache.hadoop.yarn.service.registry.YarnRegistryViewForProviders;
import org.apache.hadoop.yarn.service.timelineservice.ServiceTimelinePublisher;
@ -186,7 +187,7 @@ private static class ContainerStartedTransition extends BaseTransition {
@Override public void transition(ComponentInstance compInstance,
ComponentInstanceEvent event) {
// Query container status for ip and host
compInstance.initializeStatusRetriever(event);
compInstance.initializeStatusRetriever(event, 0);
long containerStartTime = System.currentTimeMillis();
try {
ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils
@ -266,7 +267,12 @@ public ComponentInstanceState transition(ComponentInstance instance,
instance.upgradeInProgress.set(false);
instance.setContainerState(ContainerState.RUNNING_BUT_UNREADY);
instance.initializeStatusRetriever(event);
if (instance.component.getProbe() != null &&
instance.component.getProbe() instanceof DefaultProbe) {
instance.initializeStatusRetriever(event, 30);
} else {
instance.initializeStatusRetriever(event, 0);
}
Component.UpgradeStatus status = instance.getState().equals(UPGRADING) ?
instance.component.getUpgradeStatus() :
@ -625,7 +631,7 @@ private void cancelUpgrade() {
private void reInitHelper(Component.UpgradeStatus upgradeStatus) {
cancelContainerStatusRetriever();
setContainerStatus(null);
setContainerStatus(container.getId(), null);
scheduler.executorService.submit(() -> cleanupRegistry(container.getId()));
scheduler.getContainerLaunchService()
.reInitCompInstance(scheduler.getApp(), this,
@ -634,7 +640,8 @@ private void reInitHelper(Component.UpgradeStatus upgradeStatus) {
upgradeStatus.getTargetVersion()));
}
private void initializeStatusRetriever(ComponentInstanceEvent event) {
private void initializeStatusRetriever(ComponentInstanceEvent event,
long initialDelay) {
boolean cancelOnSuccess = true;
if (getCompSpec().getArtifact() != null &&
getCompSpec().getArtifact().getType() == Artifact.TypeEnum.DOCKER) {
@ -644,10 +651,11 @@ private void initializeStatusRetriever(ComponentInstanceEvent event) {
// container relaunch (see YARN-8265).
cancelOnSuccess = false;
}
LOG.info("{} retrieve status after {}", compInstanceId, initialDelay);
containerStatusFuture =
scheduler.executorService.scheduleAtFixedRate(
new ContainerStatusRetriever(scheduler, event.getContainerId(),
this, cancelOnSuccess), 0, 1,
this, cancelOnSuccess), initialDelay, 1,
TimeUnit.SECONDS);
}
@ -743,32 +751,44 @@ public ContainerStatus getContainerStatus() {
}
}
private void setContainerStatus(ContainerStatus latestStatus) {
private void setContainerStatus(ContainerId containerId,
ContainerStatus latestStatus) {
try {
writeLock.lock();
this.status = latestStatus;
org.apache.hadoop.yarn.service.api.records.Container containerRec =
getCompSpec().getContainer(containerId.toString());
if (containerRec != null) {
if (latestStatus != null) {
containerRec.setIp(StringUtils.join(",", latestStatus.getIPs()));
containerRec.setHostname(latestStatus.getHost());
} else {
containerRec.setIp(null);
containerRec.setHostname(null);
}
}
} finally {
writeLock.unlock();
}
}
public void updateContainerStatus(ContainerStatus status) {
setContainerStatus(status);
org.apache.hadoop.yarn.service.api.records.Container container =
org.apache.hadoop.yarn.service.api.records.Container containerRec =
getCompSpec().getContainer(status.getContainerId().toString());
boolean doRegistryUpdate = true;
if (container != null) {
String existingIP = container.getIp();
if (containerRec != null) {
String existingIP = containerRec.getIp();
String newIP = StringUtils.join(",", status.getIPs());
container.setIp(newIP);
container.setHostname(status.getHost());
if (existingIP != null && newIP.equals(existingIP)) {
doRegistryUpdate = false;
}
if (timelineServiceEnabled && doRegistryUpdate) {
serviceTimelinePublisher.componentInstanceIPHostUpdated(container);
}
setContainerStatus(status.getContainerId(), status);
if (containerRec != null && timelineServiceEnabled && doRegistryUpdate) {
serviceTimelinePublisher.componentInstanceIPHostUpdated(containerRec);
}
if (doRegistryUpdate) {
cleanupRegistry(status.getContainerId());
LOG.info(

View File

@ -363,8 +363,12 @@ public void testRecoverComponentsAfterRMRestart() throws Exception {
Multimap<String, String> containersAfterFailure = waitForAllCompToBeReady(
client, exampleApp);
Assert.assertEquals("component container affected by restart",
containersBeforeFailure, containersAfterFailure);
containersBeforeFailure.keys().forEach(compName -> {
Assert.assertEquals("num containers after by restart for " + compName,
containersBeforeFailure.get(compName).size(),
containersAfterFailure.get(compName) == null ? 0 :
containersAfterFailure.get(compName).size());
});
LOG.info("Stop/destroy service {}", exampleApp);
client.actionStop(exampleApp.getName(), true);