YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts

This commit is contained in:
Jason Lowe 2015-02-10 17:27:21 +00:00
parent 4eb5f7fa32
commit 3f5431a22f
4 changed files with 136 additions and 31 deletions

View File

@ -530,6 +530,9 @@ Release 2.7.0 - UNRELEASED
YARN-3090. DeletionService can silently ignore deletion task failures
(Varun Saxena via jlowe)
YARN-2809. Implement workaround for linux kernel panic when removing
cgroup (Nathan Roberts via jlowe)
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES

View File

@ -1003,6 +1003,15 @@ private static void addDeprecatedKeys() {
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
1000;
/**
* Configuration key for the delay, in milliseconds, between attempts to
* remove a linux cgroup.
*/
public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms";
/**
* Default delay, in milliseconds, between attempts to remove a linux cgroup.
*/
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
20;
/**
* The Windows group that the windows-secure-container-executor should run as.
*/

View File

@ -22,6 +22,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
@ -37,6 +38,7 @@
import java.util.regex.Pattern;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
private final Map<String, String> controllerPaths; // Controller -> path
// Upper bound, in ms, on how long deleteCgroup keeps retrying; read in
// initConfig from NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT.
private long deleteCgroupTimeout;
// Sleep, in ms, used between cgroup removal attempts; read in initConfig
// from NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY.
private long deleteCgroupDelay;
// package private for testing purposes
Clock clock;
@ -108,6 +111,9 @@ void initConfig() throws IOException {
this.deleteCgroupTimeout = conf.getLong(
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
this.deleteCgroupDelay =
conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY,
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY);
// remove extra /'s at end or start of cgroupPrefix
if (cgroupPrefix.charAt(0) == '/') {
cgroupPrefix = cgroupPrefix.substring(1);
@ -271,31 +277,78 @@ private void updateCgroup(String controller, String groupName, String param,
}
}
/*
 * Debug aid: when debug logging is enabled, reads the first line of the
 * "tasks" file under the given cgroup directory and logs it. Nothing is
 * read at all when debug logging is off. A failure to read the file is
 * reported as a warning and otherwise ignored.
 */
private void logLineFromTasksFile(File cgf) {
  if (!LOG.isDebugEnabled()) {
    return;
  }
  final String tasksPath = cgf + "/tasks";
  try (BufferedReader tasksReader = new BufferedReader(
      new InputStreamReader(new FileInputStream(tasksPath), "UTF-8"))) {
    final String firstLine = tasksReader.readLine();
    if (firstLine != null) {
      LOG.debug("First line in cgroup tasks file: " + cgf + " " + firstLine);
    }
  } catch (IOException e) {
    LOG.warn("Failed to read cgroup tasks file. ", e);
  }
}
/**
 * Deletes the cgroup only if its "tasks" file is empty.
 *
 * When the first read of the tasks file hits end-of-file, the method sleeps
 * for {@code deleteCgroupDelay} ms and then attempts the delete. Some
 * versions of linux will occasionally panic due to a race condition in this
 * area, hence the extra pause. When the tasks file still has content, the
 * cgroup is left alone and the first line is logged at debug level.
 *
 * @param cgf directory object referring to the cgroup to be deleted
 * @return true if the cgroup was deleted; false if the tasks file was
 *         non-empty, could not be read, or the delete attempt failed
 * @throws InterruptedException if interrupted while sleeping before the
 *         delete attempt
 */
@VisibleForTesting
boolean checkAndDeleteCgroup(File cgf) throws InterruptedException {
  boolean removed = false;
  try (FileInputStream tasks = new FileInputStream(cgf + "/tasks")) {
    final boolean hasTasks = tasks.read() != -1;
    if (hasTasks) {
      logLineFromTasksFile(cgf);
    } else {
      // Empty tasks file: pause before removing to sidestep the kernel race.
      Thread.sleep(deleteCgroupDelay);
      removed = cgf.delete();
      if (!removed) {
        LOG.warn("Failed attempt to delete cgroup: " + cgf);
      }
    }
  } catch (IOException e) {
    LOG.warn("Failed to read cgroup tasks file. ", e);
  }
  return removed;
}
/**
 * Repeatedly attempts to remove the cgroup directory at the given path until
 * it is deleted or {@code deleteCgroupTimeout} ms have elapsed according to
 * {@code clock}.
 *
 * Every attempt goes through {@link #checkAndDeleteCgroup(File)}, so the
 * directory is only removed once its tasks file reads as empty. Between
 * failed attempts the method sleeps for {@code deleteCgroupDelay} ms.
 *
 * An interrupt does not stop the retries, but the thread's interrupt status
 * is restored before returning so callers can still observe it.
 *
 * @param cgroupPath path of the cgroup directory to delete
 * @return true if the cgroup was deleted, false if the attempts timed out
 */
@VisibleForTesting
boolean deleteCgroup(String cgroupPath) {
  boolean deleted = false;
  // Set when an interrupt arrives mid-retry; re-asserted in the finally.
  boolean interrupted = false;

  if (LOG.isDebugEnabled()) {
    LOG.debug("deleteCgroup: " + cgroupPath);
  }

  final long start = clock.getTime();
  try {
    do {
      try {
        // Never delete directly: the tasks-file check is the workaround.
        deleted = checkAndDeleteCgroup(new File(cgroupPath));
        if (!deleted) {
          Thread.sleep(deleteCgroupDelay);
        }
      } catch (InterruptedException ex) {
        // Keep retrying until the timeout, but do not lose the interrupt.
        interrupted = true;
      }
    } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
  } finally {
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }

  if (!deleted) {
    LOG.warn("Unable to delete cgroup at: " + cgroupPath +
        ", tried to delete for " + deleteCgroupTimeout + "ms");
  }
  return deleted;
}

View File

@ -26,6 +26,8 @@
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.Clock;
import org.junit.Test;
import org.junit.After;
import org.junit.Before;
import org.mockito.Mockito;
import java.io.*;
@ -35,6 +37,7 @@
import java.util.concurrent.CountDownLatch;
public class TestCgroupsLCEResourcesHandler {
static File cgroupDir = null;
static class MockClock implements Clock {
long time;
@ -43,6 +46,51 @@ public long getTime() {
return time;
}
}
/**
 * Points {@code cgroupDir} at a scratch directory named after this test
 * class and removes anything left there by an earlier run. The parent is
 * taken from the "test.build.data" system property, falling back to
 * "java.io.tmpdir" and finally to "target".
 */
@Before
public void setUp() throws Exception {
  final String fallbackRoot = System.getProperty("java.io.tmpdir", "target");
  final String dataRoot = System.getProperty("test.build.data", fallbackRoot);
  cgroupDir = new File(dataRoot, this.getClass().getName());
  FileUtils.deleteQuietly(cgroupDir);
}
/**
* Removes the scratch cgroup directory after each test.
*/
@After
public void tearDown() throws Exception {
FileUtils.deleteQuietly(cgroupDir);
}
/**
 * Exercises checkAndDeleteCgroup for three states of the "tasks" file:
 * missing, empty, and non-empty.
 */
@Test
public void testcheckAndDeleteCgroup() throws Exception {
  CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
  handler.setConf(new YarnConfiguration());
  handler.initConfig();

  FileUtils.deleteQuietly(cgroupDir);
  // Test 0
  // tasks file not present, should return false
  Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir));

  File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
  File fspy;
  // try-with-resources so the stream is closed even if an assertion fails
  try (FileOutputStream fos = FileUtils.openOutputStream(tfile)) {
    fspy = Mockito.spy(cgroupDir);

    // Test 1, tasks file is empty
    // tasks file has no data, should return true (delete is stubbed)
    Mockito.stub(fspy.delete()).toReturn(true);
    Assert.assertTrue(handler.checkAndDeleteCgroup(fspy));

    // Test 2, tasks file has data; explicit charset avoids the platform
    // default
    fos.write("1234".getBytes("UTF-8"));
  }
  // tasks has data, would not be able to delete, should return false
  Assert.assertFalse(handler.checkAndDeleteCgroup(fspy));
  FileUtils.deleteQuietly(cgroupDir);
}
// Verify DeleteCgroup times out if "tasks" file contains data
@Test
public void testDeleteCgroup() throws Exception {
final MockClock clock = new MockClock();
@ -52,12 +100,14 @@ public void testDeleteCgroup() throws Exception {
handler.initConfig();
handler.clock = clock;
//file exists
File file = new File("target", UUID.randomUUID().toString());
new FileOutputStream(file).close();
Assert.assertTrue(handler.deleteCgroup(file.getPath()));
FileUtils.deleteQuietly(cgroupDir);
// Create a non-empty tasks file
File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
FileOutputStream fos = FileUtils.openOutputStream(tfile);
fos.write("1234".getBytes());
fos.close();
//file does not exists, timing out
final CountDownLatch latch = new CountDownLatch(1);
new Thread() {
@Override
@ -73,8 +123,8 @@ public void run() {
}
}.start();
latch.await();
file = new File("target", UUID.randomUUID().toString());
Assert.assertFalse(handler.deleteCgroup(file.getPath()));
Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath()));
FileUtils.deleteQuietly(cgroupDir);
}
static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
@ -122,7 +172,6 @@ public void testInit() throws IOException {
handler.initConfig();
// create mock cgroup
File cgroupDir = createMockCgroup();
File cgroupMountDir = createMockCgroupMount(cgroupDir);
// create mock mtab
@ -202,18 +251,10 @@ public void testGetOverallLimits() {
Assert.assertEquals(-1, ret[1]);
}
/**
 * Creates a uniquely named directory under "target" to stand in for a
 * cgroup.
 *
 * @return the newly created directory
 * @throws IOException if the directory could not be created
 */
private File createMockCgroup() throws IOException {
  final File mockCgroup = new File("target", UUID.randomUUID().toString());
  if (mockCgroup.mkdir()) {
    return mockCgroup;
  }
  throw new IOException("Could not create dir " + mockCgroup.getAbsolutePath());
}
private File createMockCgroupMount(File cgroupDir) throws IOException {
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
if (!cgroupMountDir.mkdir()) {
FileUtils.deleteQuietly(cgroupDir);
if (!cgroupMountDir.mkdirs()) {
String message =
"Could not create dir " + cgroupMountDir.getAbsolutePath();
throw new IOException(message);
@ -253,7 +294,6 @@ public void testContainerLimits() throws IOException {
handler.initConfig();
// create mock cgroup
File cgroupDir = createMockCgroup();
File cgroupMountDir = createMockCgroupMount(cgroupDir);
// create mock mtab