YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts
This commit is contained in:
parent
4eb5f7fa32
commit
3f5431a22f
@ -530,6 +530,9 @@ Release 2.7.0 - UNRELEASED
|
|||||||
YARN-3090. DeletionService can silently ignore deletion task failures
|
YARN-3090. DeletionService can silently ignore deletion task failures
|
||||||
(Varun Saxena via jlowe)
|
(Varun Saxena via jlowe)
|
||||||
|
|
||||||
|
YARN-2809. Implement workaround for linux kernel panic when removing
|
||||||
|
cgroup (Nathan Roberts via jlowe)
|
||||||
|
|
||||||
Release 2.6.0 - 2014-11-18
|
Release 2.6.0 - 2014-11-18
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
@ -1003,6 +1003,15 @@ private static void addDeprecatedKeys() {
|
|||||||
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
|
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
|
||||||
1000;
|
1000;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delay between attempts to remove linux cgroup.
|
||||||
|
*/
|
||||||
|
public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
|
||||||
|
NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms";
|
||||||
|
|
||||||
|
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
|
||||||
|
20;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
/* The Windows group that the windows-secure-container-executor should run as.
|
/* The Windows group that the windows-secure-container-executor should run as.
|
||||||
*/
|
*/
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
@ -37,6 +38,7 @@
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||||||
private final Map<String, String> controllerPaths; // Controller -> path
|
private final Map<String, String> controllerPaths; // Controller -> path
|
||||||
|
|
||||||
private long deleteCgroupTimeout;
|
private long deleteCgroupTimeout;
|
||||||
|
private long deleteCgroupDelay;
|
||||||
// package private for testing purposes
|
// package private for testing purposes
|
||||||
Clock clock;
|
Clock clock;
|
||||||
|
|
||||||
@ -108,6 +111,9 @@ void initConfig() throws IOException {
|
|||||||
this.deleteCgroupTimeout = conf.getLong(
|
this.deleteCgroupTimeout = conf.getLong(
|
||||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
|
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
|
||||||
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
|
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
|
||||||
|
this.deleteCgroupDelay =
|
||||||
|
conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY,
|
||||||
|
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY);
|
||||||
// remove extra /'s at end or start of cgroupPrefix
|
// remove extra /'s at end or start of cgroupPrefix
|
||||||
if (cgroupPrefix.charAt(0) == '/') {
|
if (cgroupPrefix.charAt(0) == '/') {
|
||||||
cgroupPrefix = cgroupPrefix.substring(1);
|
cgroupPrefix = cgroupPrefix.substring(1);
|
||||||
@ -271,23 +277,71 @@ private void updateCgroup(String controller, String groupName, String param,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Utility routine to print first line from cgroup tasks file
|
||||||
|
*/
|
||||||
|
private void logLineFromTasksFile(File cgf) {
|
||||||
|
String str;
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
try (BufferedReader inl =
|
||||||
|
new BufferedReader(new InputStreamReader(new FileInputStream(cgf
|
||||||
|
+ "/tasks"), "UTF-8"))) {
|
||||||
|
if ((str = inl.readLine()) != null) {
|
||||||
|
LOG.debug("First line in cgroup tasks file: " + cgf + " " + str);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Failed to read cgroup tasks file. ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If tasks file is empty, delete the cgroup.
|
||||||
|
*
|
||||||
|
* @param cgf file object referring to the cgroup to be deleted
|
||||||
|
* @return Boolean indicating whether cgroup was deleted
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
boolean checkAndDeleteCgroup(File cgf) throws InterruptedException {
|
||||||
|
boolean deleted = false;
|
||||||
|
// FileInputStream in = null;
|
||||||
|
try (FileInputStream in = new FileInputStream(cgf + "/tasks")) {
|
||||||
|
if (in.read() == -1) {
|
||||||
|
/*
|
||||||
|
* "tasks" file is empty, sleep a bit more and then try to delete the
|
||||||
|
* cgroup. Some versions of linux will occasionally panic due to a race
|
||||||
|
* condition in this area, hence the paranoia.
|
||||||
|
*/
|
||||||
|
Thread.sleep(deleteCgroupDelay);
|
||||||
|
deleted = cgf.delete();
|
||||||
|
if (!deleted) {
|
||||||
|
LOG.warn("Failed attempt to delete cgroup: " + cgf);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
logLineFromTasksFile(cgf);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Failed to read cgroup tasks file. ", e);
|
||||||
|
}
|
||||||
|
return deleted;
|
||||||
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
boolean deleteCgroup(String cgroupPath) {
|
boolean deleteCgroup(String cgroupPath) {
|
||||||
boolean deleted;
|
boolean deleted = false;
|
||||||
|
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("deleteCgroup: " + cgroupPath);
|
LOG.debug("deleteCgroup: " + cgroupPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
long start = clock.getTime();
|
long start = clock.getTime();
|
||||||
do {
|
do {
|
||||||
deleted = new File(cgroupPath).delete();
|
try {
|
||||||
if (!deleted) {
|
deleted = checkAndDeleteCgroup(new File(cgroupPath));
|
||||||
try {
|
if (!deleted) {
|
||||||
Thread.sleep(20);
|
Thread.sleep(deleteCgroupDelay);
|
||||||
} catch (InterruptedException ex) {
|
|
||||||
// NOP
|
|
||||||
}
|
}
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
// NOP
|
||||||
}
|
}
|
||||||
} while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
|
} while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
|
||||||
|
|
||||||
@ -295,7 +349,6 @@ boolean deleteCgroup(String cgroupPath) {
|
|||||||
LOG.warn("Unable to delete cgroup at: " + cgroupPath +
|
LOG.warn("Unable to delete cgroup at: " + cgroupPath +
|
||||||
", tried to delete for " + deleteCgroupTimeout + "ms");
|
", tried to delete for " + deleteCgroupTimeout + "ms");
|
||||||
}
|
}
|
||||||
|
|
||||||
return deleted;
|
return deleted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,6 +26,8 @@
|
|||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.util.Clock;
|
import org.apache.hadoop.yarn.util.Clock;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
@ -35,6 +37,7 @@
|
|||||||
import java.util.concurrent.CountDownLatch;
|
import java.util.concurrent.CountDownLatch;
|
||||||
|
|
||||||
public class TestCgroupsLCEResourcesHandler {
|
public class TestCgroupsLCEResourcesHandler {
|
||||||
|
static File cgroupDir = null;
|
||||||
|
|
||||||
static class MockClock implements Clock {
|
static class MockClock implements Clock {
|
||||||
long time;
|
long time;
|
||||||
@ -43,6 +46,51 @@ public long getTime() {
|
|||||||
return time;
|
return time;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
cgroupDir =
|
||||||
|
new File(System.getProperty("test.build.data",
|
||||||
|
System.getProperty("java.io.tmpdir", "target")), this.getClass()
|
||||||
|
.getName());
|
||||||
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testcheckAndDeleteCgroup() throws Exception {
|
||||||
|
CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
|
||||||
|
handler.setConf(new YarnConfiguration());
|
||||||
|
handler.initConfig();
|
||||||
|
|
||||||
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
|
// Test 0
|
||||||
|
// tasks file not present, should return false
|
||||||
|
Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir));
|
||||||
|
|
||||||
|
File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
|
||||||
|
FileOutputStream fos = FileUtils.openOutputStream(tfile);
|
||||||
|
File fspy = Mockito.spy(cgroupDir);
|
||||||
|
|
||||||
|
// Test 1, tasks file is empty
|
||||||
|
// tasks file has no data, should return true
|
||||||
|
Mockito.stub(fspy.delete()).toReturn(true);
|
||||||
|
Assert.assertTrue(handler.checkAndDeleteCgroup(fspy));
|
||||||
|
|
||||||
|
// Test 2, tasks file has data
|
||||||
|
fos.write("1234".getBytes());
|
||||||
|
fos.close();
|
||||||
|
// tasks has data, would not be able to delete, should return false
|
||||||
|
Assert.assertFalse(handler.checkAndDeleteCgroup(fspy));
|
||||||
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify DeleteCgroup times out if "tasks" file contains data
|
||||||
@Test
|
@Test
|
||||||
public void testDeleteCgroup() throws Exception {
|
public void testDeleteCgroup() throws Exception {
|
||||||
final MockClock clock = new MockClock();
|
final MockClock clock = new MockClock();
|
||||||
@ -52,12 +100,14 @@ public void testDeleteCgroup() throws Exception {
|
|||||||
handler.initConfig();
|
handler.initConfig();
|
||||||
handler.clock = clock;
|
handler.clock = clock;
|
||||||
|
|
||||||
//file exists
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
File file = new File("target", UUID.randomUUID().toString());
|
|
||||||
new FileOutputStream(file).close();
|
// Create a non-empty tasks file
|
||||||
Assert.assertTrue(handler.deleteCgroup(file.getPath()));
|
File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
|
||||||
|
FileOutputStream fos = FileUtils.openOutputStream(tfile);
|
||||||
|
fos.write("1234".getBytes());
|
||||||
|
fos.close();
|
||||||
|
|
||||||
//file does not exist, timing out
|
|
||||||
final CountDownLatch latch = new CountDownLatch(1);
|
final CountDownLatch latch = new CountDownLatch(1);
|
||||||
new Thread() {
|
new Thread() {
|
||||||
@Override
|
@Override
|
||||||
@ -73,8 +123,8 @@ public void run() {
|
|||||||
}
|
}
|
||||||
}.start();
|
}.start();
|
||||||
latch.await();
|
latch.await();
|
||||||
file = new File("target", UUID.randomUUID().toString());
|
Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath()));
|
||||||
Assert.assertFalse(handler.deleteCgroup(file.getPath()));
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
|
static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
|
||||||
@ -122,7 +172,6 @@ public void testInit() throws IOException {
|
|||||||
handler.initConfig();
|
handler.initConfig();
|
||||||
|
|
||||||
// create mock cgroup
|
// create mock cgroup
|
||||||
File cgroupDir = createMockCgroup();
|
|
||||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||||
|
|
||||||
// create mock mtab
|
// create mock mtab
|
||||||
@ -202,18 +251,10 @@ public void testGetOverallLimits() {
|
|||||||
Assert.assertEquals(-1, ret[1]);
|
Assert.assertEquals(-1, ret[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
private File createMockCgroup() throws IOException {
|
|
||||||
File cgroupDir = new File("target", UUID.randomUUID().toString());
|
|
||||||
if (!cgroupDir.mkdir()) {
|
|
||||||
String message = "Could not create dir " + cgroupDir.getAbsolutePath();
|
|
||||||
throw new IOException(message);
|
|
||||||
}
|
|
||||||
return cgroupDir;
|
|
||||||
}
|
|
||||||
|
|
||||||
private File createMockCgroupMount(File cgroupDir) throws IOException {
|
private File createMockCgroupMount(File cgroupDir) throws IOException {
|
||||||
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
|
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
|
||||||
if (!cgroupMountDir.mkdir()) {
|
FileUtils.deleteQuietly(cgroupDir);
|
||||||
|
if (!cgroupMountDir.mkdirs()) {
|
||||||
String message =
|
String message =
|
||||||
"Could not create dir " + cgroupMountDir.getAbsolutePath();
|
"Could not create dir " + cgroupMountDir.getAbsolutePath();
|
||||||
throw new IOException(message);
|
throw new IOException(message);
|
||||||
@ -253,7 +294,6 @@ public void testContainerLimits() throws IOException {
|
|||||||
handler.initConfig();
|
handler.initConfig();
|
||||||
|
|
||||||
// create mock cgroup
|
// create mock cgroup
|
||||||
File cgroupDir = createMockCgroup();
|
|
||||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||||
|
|
||||||
// create mock mtab
|
// create mock mtab
|
||||||
|
Loading…
Reference in New Issue
Block a user