YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts
This commit is contained in:
parent
4eb5f7fa32
commit
3f5431a22f
@ -530,6 +530,9 @@ Release 2.7.0 - UNRELEASED
|
||||
YARN-3090. DeletionService can silently ignore deletion task failures
|
||||
(Varun Saxena via jlowe)
|
||||
|
||||
YARN-2809. Implement workaround for linux kernel panic when removing
|
||||
cgroup (Nathan Roberts via jlowe)
|
||||
|
||||
Release 2.6.0 - 2014-11-18
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -1003,6 +1003,15 @@ private static void addDeprecatedKeys() {
|
||||
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
|
||||
1000;
|
||||
|
||||
/**
|
||||
* Delay, in milliseconds, between attempts to remove a Linux cgroup.
|
||||
*/
|
||||
public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
|
||||
NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms";
|
||||
|
||||
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
|
||||
20;
|
||||
|
||||
/**
|
||||
* The Windows group that the windows-secure-container-executor should run as.
|
||||
*/
|
||||
|
@ -22,6 +22,7 @@
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
@ -37,6 +38,7 @@
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||
private final Map<String, String> controllerPaths; // Controller -> path
|
||||
|
||||
private long deleteCgroupTimeout;
|
||||
private long deleteCgroupDelay;
|
||||
// package private for testing purposes
|
||||
Clock clock;
|
||||
|
||||
@ -108,6 +111,9 @@ void initConfig() throws IOException {
|
||||
this.deleteCgroupTimeout = conf.getLong(
|
||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
|
||||
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
|
||||
this.deleteCgroupDelay =
|
||||
conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY,
|
||||
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY);
|
||||
// remove extra /'s at end or start of cgroupPrefix
|
||||
if (cgroupPrefix.charAt(0) == '/') {
|
||||
cgroupPrefix = cgroupPrefix.substring(1);
|
||||
@ -271,31 +277,78 @@ private void updateCgroup(String controller, String groupName, String param,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Utility routine to print first line from cgroup tasks file
|
||||
*/
|
||||
private void logLineFromTasksFile(File cgf) {
|
||||
String str;
|
||||
if (LOG.isDebugEnabled()) {
|
||||
try (BufferedReader inl =
|
||||
new BufferedReader(new InputStreamReader(new FileInputStream(cgf
|
||||
+ "/tasks"), "UTF-8"))) {
|
||||
if ((str = inl.readLine()) != null) {
|
||||
LOG.debug("First line in cgroup tasks file: " + cgf + " " + str);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to read cgroup tasks file. ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If tasks file is empty, delete the cgroup.
|
||||
*
|
||||
* @param file object referring to the cgroup to be deleted
|
||||
* @return Boolean indicating whether cgroup was deleted
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean checkAndDeleteCgroup(File cgf) throws InterruptedException {
|
||||
boolean deleted = false;
|
||||
// FileInputStream in = null;
|
||||
try (FileInputStream in = new FileInputStream(cgf + "/tasks")) {
|
||||
if (in.read() == -1) {
|
||||
/*
|
||||
* "tasks" file is empty, sleep a bit more and then try to delete the
|
||||
* cgroup. Some versions of linux will occasionally panic due to a race
|
||||
* condition in this area, hence the paranoia.
|
||||
*/
|
||||
Thread.sleep(deleteCgroupDelay);
|
||||
deleted = cgf.delete();
|
||||
if (!deleted) {
|
||||
LOG.warn("Failed attempt to delete cgroup: " + cgf);
|
||||
}
|
||||
} else {
|
||||
logLineFromTasksFile(cgf);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to read cgroup tasks file. ", e);
|
||||
}
|
||||
return deleted;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
boolean deleteCgroup(String cgroupPath) {
|
||||
boolean deleted;
|
||||
boolean deleted = false;
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("deleteCgroup: " + cgroupPath);
|
||||
}
|
||||
|
||||
long start = clock.getTime();
|
||||
do {
|
||||
deleted = new File(cgroupPath).delete();
|
||||
if (!deleted) {
|
||||
try {
|
||||
Thread.sleep(20);
|
||||
deleted = checkAndDeleteCgroup(new File(cgroupPath));
|
||||
if (!deleted) {
|
||||
Thread.sleep(deleteCgroupDelay);
|
||||
}
|
||||
} catch (InterruptedException ex) {
|
||||
// NOP
|
||||
}
|
||||
}
|
||||
} while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
|
||||
|
||||
if (!deleted) {
|
||||
LOG.warn("Unable to delete cgroup at: " + cgroupPath +
|
||||
", tried to delete for " + deleteCgroupTimeout + "ms");
|
||||
}
|
||||
|
||||
return deleted;
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,8 @@
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.util.Clock;
|
||||
import org.junit.Test;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.*;
|
||||
@ -35,6 +37,7 @@
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
|
||||
public class TestCgroupsLCEResourcesHandler {
|
||||
static File cgroupDir = null;
|
||||
|
||||
static class MockClock implements Clock {
|
||||
long time;
|
||||
@ -43,6 +46,51 @@ public long getTime() {
|
||||
return time;
|
||||
}
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
cgroupDir =
|
||||
new File(System.getProperty("test.build.data",
|
||||
System.getProperty("java.io.tmpdir", "target")), this.getClass()
|
||||
.getName());
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() throws Exception {
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testcheckAndDeleteCgroup() throws Exception {
|
||||
CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
|
||||
handler.setConf(new YarnConfiguration());
|
||||
handler.initConfig();
|
||||
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
// Test 0
|
||||
// tasks file not present, should return false
|
||||
Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir));
|
||||
|
||||
File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
|
||||
FileOutputStream fos = FileUtils.openOutputStream(tfile);
|
||||
File fspy = Mockito.spy(cgroupDir);
|
||||
|
||||
// Test 1, tasks file is empty
|
||||
// tasks file has no data, should return true
|
||||
Mockito.stub(fspy.delete()).toReturn(true);
|
||||
Assert.assertTrue(handler.checkAndDeleteCgroup(fspy));
|
||||
|
||||
// Test 2, tasks file has data
|
||||
fos.write("1234".getBytes());
|
||||
fos.close();
|
||||
// tasks has data, would not be able to delete, should return false
|
||||
Assert.assertFalse(handler.checkAndDeleteCgroup(fspy));
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
|
||||
}
|
||||
|
||||
// Verify DeleteCgroup times out if "tasks" file contains data
|
||||
@Test
|
||||
public void testDeleteCgroup() throws Exception {
|
||||
final MockClock clock = new MockClock();
|
||||
@ -52,12 +100,14 @@ public void testDeleteCgroup() throws Exception {
|
||||
handler.initConfig();
|
||||
handler.clock = clock;
|
||||
|
||||
//file exists
|
||||
File file = new File("target", UUID.randomUUID().toString());
|
||||
new FileOutputStream(file).close();
|
||||
Assert.assertTrue(handler.deleteCgroup(file.getPath()));
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
|
||||
// Create a non-empty tasks file
|
||||
File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
|
||||
FileOutputStream fos = FileUtils.openOutputStream(tfile);
|
||||
fos.write("1234".getBytes());
|
||||
fos.close();
|
||||
|
||||
//file does not exist, timing out
|
||||
final CountDownLatch latch = new CountDownLatch(1);
|
||||
new Thread() {
|
||||
@Override
|
||||
@ -73,8 +123,8 @@ public void run() {
|
||||
}
|
||||
}.start();
|
||||
latch.await();
|
||||
file = new File("target", UUID.randomUUID().toString());
|
||||
Assert.assertFalse(handler.deleteCgroup(file.getPath()));
|
||||
Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath()));
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
}
|
||||
|
||||
static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
|
||||
@ -122,7 +172,6 @@ public void testInit() throws IOException {
|
||||
handler.initConfig();
|
||||
|
||||
// create mock cgroup
|
||||
File cgroupDir = createMockCgroup();
|
||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||
|
||||
// create mock mtab
|
||||
@ -202,18 +251,10 @@ public void testGetOverallLimits() {
|
||||
Assert.assertEquals(-1, ret[1]);
|
||||
}
|
||||
|
||||
private File createMockCgroup() throws IOException {
|
||||
File cgroupDir = new File("target", UUID.randomUUID().toString());
|
||||
if (!cgroupDir.mkdir()) {
|
||||
String message = "Could not create dir " + cgroupDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
return cgroupDir;
|
||||
}
|
||||
|
||||
private File createMockCgroupMount(File cgroupDir) throws IOException {
|
||||
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
|
||||
if (!cgroupMountDir.mkdir()) {
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
if (!cgroupMountDir.mkdirs()) {
|
||||
String message =
|
||||
"Could not create dir " + cgroupMountDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
@ -253,7 +294,6 @@ public void testContainerLimits() throws IOException {
|
||||
handler.initConfig();
|
||||
|
||||
// create mock cgroup
|
||||
File cgroupDir = createMockCgroup();
|
||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||
|
||||
// create mock mtab
|
||||
|
Loading…
Reference in New Issue
Block a user