YARN-8265. Improve DNS handling on docker IP changes.

Contributed by Billie Rinaldi
This commit is contained in:
Eric Yang 2018-05-11 22:37:43 -07:00
parent 6c8e51ca7e
commit 0ff94563b9
4 changed files with 173 additions and 11 deletions

View File

@ -39,6 +39,7 @@
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.service.ServiceScheduler;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.api.records.ContainerState;
import org.apache.hadoop.yarn.service.component.Component;
import org.apache.hadoop.yarn.service.component.ComponentEvent;
@ -151,10 +152,19 @@ private static class ContainerStartedTransition extends BaseTransition {
@Override public void transition(ComponentInstance compInstance,
ComponentInstanceEvent event) {
// Query container status for ip and host
boolean cancelOnSuccess = true;
if (compInstance.getCompSpec().getArtifact() != null && compInstance
.getCompSpec().getArtifact().getType() == Artifact.TypeEnum.DOCKER) {
// A docker container might get a different IP if the container is
// relaunched by the NM, so we need to keep checking the status.
// This is a temporary fix until the NM provides a callback for
// container relaunch (see YARN-8265).
cancelOnSuccess = false;
}
compInstance.containerStatusFuture =
compInstance.scheduler.executorService.scheduleAtFixedRate(
new ContainerStatusRetriever(compInstance.scheduler,
event.getContainerId(), compInstance), 0, 1,
event.getContainerId(), compInstance, cancelOnSuccess), 0, 1,
TimeUnit.SECONDS);
long containerStartTime = System.currentTimeMillis();
try {
@ -373,14 +383,26 @@ public void updateContainerStatus(ContainerStatus status) {
this.status = status;
org.apache.hadoop.yarn.service.api.records.Container container =
getCompSpec().getContainer(status.getContainerId().toString());
boolean doRegistryUpdate = true;
if (container != null) {
container.setIp(StringUtils.join(",", status.getIPs()));
String existingIP = container.getIp();
String newIP = StringUtils.join(",", status.getIPs());
container.setIp(newIP);
container.setHostname(status.getHost());
if (timelineServiceEnabled) {
if (existingIP != null && newIP.equals(existingIP)) {
doRegistryUpdate = false;
}
if (timelineServiceEnabled && doRegistryUpdate) {
serviceTimelinePublisher.componentInstanceIPHostUpdated(container);
}
}
updateServiceRecord(yarnRegistryOperations, status);
if (doRegistryUpdate) {
cleanupRegistry(status.getContainerId());
LOG.info(
getCompInstanceId() + " new IP = " + status.getIPs() + ", host = "
+ status.getHost() + ", updating registry");
updateServiceRecord(yarnRegistryOperations, status);
}
}
public String getCompName() {
@ -522,12 +544,15 @@ private static class ContainerStatusRetriever implements Runnable {
private NodeId nodeId;
private NMClient nmClient;
private ComponentInstance instance;
private boolean cancelOnSuccess;
ContainerStatusRetriever(ServiceScheduler scheduler,
ContainerId containerId, ComponentInstance instance) {
ContainerId containerId, ComponentInstance instance, boolean
cancelOnSuccess) {
this.containerId = containerId;
this.nodeId = instance.getNodeId();
this.nmClient = scheduler.getNmClient().getClient();
this.instance = instance;
this.cancelOnSuccess = cancelOnSuccess;
}
@Override public void run() {
ContainerStatus status = null;
@ -548,10 +573,12 @@ private static class ContainerStatusRetriever implements Runnable {
return;
}
instance.updateContainerStatus(status);
LOG.info(
instance.compInstanceId + " IP = " + status.getIPs() + ", host = "
+ status.getHost() + ", cancel container status retriever");
instance.containerStatusFuture.cancel(false);
if (cancelOnSuccess) {
LOG.info(
instance.compInstanceId + " IP = " + status.getIPs() + ", host = "
+ status.getHost() + ", cancel container status retriever");
instance.containerStatusFuture.cancel(false);
}
}
}

View File

@ -317,6 +317,14 @@ public void feedFailedContainerToComp(Service service, int id, String
}
}
public Container updateContainerStatus(Service service, int id,
String compName, String host) {
ContainerId containerId = createContainerId(id);
Container container = createContainer(containerId, compName);
addContainerStatus(container, ContainerState.RUNNING, host);
return container;
}
public ContainerId createContainerId(int id) {
ApplicationId applicationId = ApplicationId.fromString(service.getId());
return ContainerId.newContainerId(
@ -389,10 +397,15 @@ public Boolean get() {
}
private void addContainerStatus(Container container, ContainerState state) {
addContainerStatus(container, state, container.getNodeId().getHost());
}
private void addContainerStatus(Container container, ContainerState state,
String host) {
ContainerStatus status = ContainerStatus.newInstance(container.getId(),
state, "", 0);
status.setHost(container.getNodeId().getHost());
status.setIPs(Lists.newArrayList(container.getNodeId().getHost()));
status.setHost(host);
status.setIPs(Lists.newArrayList(host));
containerStatuses.put(container.getId(), status);
}

View File

@ -34,6 +34,7 @@
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.security.DockerCredentialTokenIdentifier;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
import org.apache.hadoop.yarn.service.api.records.Service;
@ -349,4 +350,45 @@ public void testRecordTokensForContainers() throws Exception {
am.stop();
}
@Test
public void testIPChange() throws TimeoutException,
InterruptedException {
ApplicationId applicationId = ApplicationId.newInstance(123456, 1);
String comp1Name = "comp1";
String comp1InstName = "comp1-0";
Service exampleApp = new Service();
exampleApp.setId(applicationId.toString());
exampleApp.setVersion("v1");
exampleApp.setName("testIPChange");
Component comp1 = createComponent(comp1Name, 1, "sleep 60");
comp1.setArtifact(new Artifact().type(Artifact.TypeEnum.DOCKER));
exampleApp.addComponent(comp1);
MockServiceAM am = new MockServiceAM(exampleApp);
am.init(conf);
am.start();
ComponentInstance comp1inst0 = am.getCompInstance(comp1Name, comp1InstName);
// allocate a container
am.feedContainerToComp(exampleApp, 1, comp1Name);
GenericTestUtils.waitFor(() -> comp1inst0.getContainerStatus() != null,
2000, 200000);
// first host status will match the container nodeId
Assert.assertEquals("localhost",
comp1inst0.getContainerStatus().getHost());
LOG.info("Change the IP and host");
// change the container status
am.updateContainerStatus(exampleApp, 1, comp1Name, "new.host");
GenericTestUtils.waitFor(() -> comp1inst0.getContainerStatus().getHost()
.equals("new.host"), 2000, 200000);
LOG.info("Change the IP and host again");
// change the container status
am.updateContainerStatus(exampleApp, 1, comp1Name, "newer.host");
GenericTestUtils.waitFor(() -> comp1inst0.getContainerStatus().getHost()
.equals("newer.host"), 2000, 200000);
am.stop();
}
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
/** Unit tests for DockerClient. */
public class TestDockerClient {
private static final File TEST_ROOT_DIR = GenericTestUtils.getTestDir(
TestDockerClient.class.getName());
@Before
public void setup() {
TEST_ROOT_DIR.mkdirs();
}
@After
public void cleanup() {
FileUtil.fullyDelete(TEST_ROOT_DIR);
}
@Test
public void testWriteCommandToTempFile() throws Exception {
String absRoot = TEST_ROOT_DIR.getAbsolutePath();
ApplicationId appId = ApplicationId.newInstance(1, 1);
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
DockerCommand dockerCmd = new DockerInspectCommand(cid.toString());
Configuration conf = new Configuration();
conf.set("hadoop.tmp.dir", absRoot);
conf.set(YarnConfiguration.NM_LOCAL_DIRS, absRoot);
conf.set(YarnConfiguration.NM_LOG_DIRS, absRoot);
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
Context mockContext = mock(Context.class);
doReturn(conf).when(mockContext).getConf();
doReturn(dirsHandler).when(mockContext).getLocalDirsHandler();
DockerClient dockerClient = new DockerClient(conf);
dirsHandler.init(conf);
dirsHandler.start();
String tmpPath = dockerClient.writeCommandToTempFile(dockerCmd, cid,
mockContext);
dirsHandler.stop();
File tmpFile = new File(tmpPath);
assertTrue(tmpFile + " was not created", tmpFile.exists());
}
}