YARN-3122. Metrics for container's actual CPU usage. (Anubhav Dhoot via kasha)
This commit is contained in:
parent
722b479469
commit
53947f37c7
@ -351,6 +351,9 @@ Release 2.7.0 - UNRELEASED
|
|||||||
YARN-3272. Surface container locality info in RM web UI.
|
YARN-3272. Surface container locality info in RM web UI.
|
||||||
(Jian He via wangda)
|
(Jian He via wangda)
|
||||||
|
|
||||||
|
YARN-3122. Metrics for container's actual CPU usage.
|
||||||
|
(Anubhav Dhoot via kasha)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
YARN-2990. FairScheduler's delay-scheduling always waits for node-local and
|
YARN-2990. FairScheduler's delay-scheduling always waits for node-local and
|
||||||
|
@ -0,0 +1,99 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.util;
|
||||||
|
|
||||||
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
|
|
||||||
|
import java.math.BigInteger;
|
||||||
|
|
||||||
|
@InterfaceAudience.Private
|
||||||
|
@InterfaceStability.Unstable
|
||||||
|
public class CpuTimeTracker {
|
||||||
|
public static final int UNAVAILABLE = -1;
|
||||||
|
final long MINIMUM_UPDATE_INTERVAL;
|
||||||
|
|
||||||
|
// CPU used time since system is on (ms)
|
||||||
|
BigInteger cumulativeCpuTime = BigInteger.ZERO;
|
||||||
|
|
||||||
|
// CPU used time read last time (ms)
|
||||||
|
BigInteger lastCumulativeCpuTime = BigInteger.ZERO;
|
||||||
|
|
||||||
|
// Unix timestamp while reading the CPU time (ms)
|
||||||
|
long sampleTime;
|
||||||
|
long lastSampleTime;
|
||||||
|
float cpuUsage;
|
||||||
|
BigInteger jiffyLengthInMillis;
|
||||||
|
|
||||||
|
public CpuTimeTracker(long jiffyLengthInMillis) {
|
||||||
|
this.jiffyLengthInMillis = BigInteger.valueOf(jiffyLengthInMillis);
|
||||||
|
this.cpuUsage = UNAVAILABLE;
|
||||||
|
this.sampleTime = UNAVAILABLE;
|
||||||
|
this.lastSampleTime = UNAVAILABLE;
|
||||||
|
MINIMUM_UPDATE_INTERVAL = 10 * jiffyLengthInMillis;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return percentage of cpu time spent over the time since last update.
|
||||||
|
* CPU time spent is based on elapsed jiffies multiplied by amount of
|
||||||
|
* time for 1 core. Thus, if you use 2 cores completely you would have spent
|
||||||
|
* twice the actual time between updates and this will return 200%.
|
||||||
|
*
|
||||||
|
* @return Return percentage of cpu usage since last update, {@link
|
||||||
|
* CpuTimeTracker#UNAVAILABLE} if there haven't been 2 updates more than
|
||||||
|
* {@link CpuTimeTracker#MINIMUM_UPDATE_INTERVAL} apart
|
||||||
|
*/
|
||||||
|
public float getCpuTrackerUsagePercent() {
|
||||||
|
if (lastSampleTime == UNAVAILABLE ||
|
||||||
|
lastSampleTime > sampleTime) {
|
||||||
|
// lastSampleTime > sampleTime may happen when the system time is changed
|
||||||
|
lastSampleTime = sampleTime;
|
||||||
|
lastCumulativeCpuTime = cumulativeCpuTime;
|
||||||
|
return cpuUsage;
|
||||||
|
}
|
||||||
|
// When lastSampleTime is sufficiently old, update cpuUsage.
|
||||||
|
// Also take a sample of the current time and cumulative CPU time for the
|
||||||
|
// use of the next calculation.
|
||||||
|
if (sampleTime > lastSampleTime + MINIMUM_UPDATE_INTERVAL) {
|
||||||
|
cpuUsage =
|
||||||
|
((cumulativeCpuTime.subtract(lastCumulativeCpuTime)).floatValue())
|
||||||
|
* 100F / ((float) (sampleTime - lastSampleTime));
|
||||||
|
lastSampleTime = sampleTime;
|
||||||
|
lastCumulativeCpuTime = cumulativeCpuTime;
|
||||||
|
}
|
||||||
|
return cpuUsage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateElapsedJiffies(BigInteger elapedJiffies, long sampleTime) {
|
||||||
|
this.cumulativeCpuTime = elapedJiffies.multiply(jiffyLengthInMillis);
|
||||||
|
this.sampleTime = sampleTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("SampleTime " + this.sampleTime);
|
||||||
|
sb.append(" CummulativeCpuTime " + this.cumulativeCpuTime);
|
||||||
|
sb.append(" LastSampleTime " + this.lastSampleTime);
|
||||||
|
sb.append(" LastCummulativeCpuTime " + this.lastCumulativeCpuTime);
|
||||||
|
sb.append(" CpuUsage " + this.cpuUsage);
|
||||||
|
sb.append(" JiffyLengthMillisec " + this.jiffyLengthInMillis);
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -23,6 +23,7 @@
|
|||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.math.BigInteger;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
@ -41,8 +42,6 @@ public class LinuxResourceCalculatorPlugin extends ResourceCalculatorPlugin {
|
|||||||
private static final Log LOG =
|
private static final Log LOG =
|
||||||
LogFactory.getLog(LinuxResourceCalculatorPlugin.class);
|
LogFactory.getLog(LinuxResourceCalculatorPlugin.class);
|
||||||
|
|
||||||
public static final int UNAVAILABLE = -1;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* proc's meminfo virtual file has keys-values in the format
|
* proc's meminfo virtual file has keys-values in the format
|
||||||
* "key:[ \t]*value[ \t]kB".
|
* "key:[ \t]*value[ \t]kB".
|
||||||
@ -74,6 +73,7 @@ public class LinuxResourceCalculatorPlugin extends ResourceCalculatorPlugin {
|
|||||||
private static final Pattern CPU_TIME_FORMAT =
|
private static final Pattern CPU_TIME_FORMAT =
|
||||||
Pattern.compile("^cpu[ \t]*([0-9]*)" +
|
Pattern.compile("^cpu[ \t]*([0-9]*)" +
|
||||||
"[ \t]*([0-9]*)[ \t]*([0-9]*)[ \t].*");
|
"[ \t]*([0-9]*)[ \t]*([0-9]*)[ \t].*");
|
||||||
|
private CpuTimeTracker cpuTimeTracker;
|
||||||
|
|
||||||
private String procfsMemFile;
|
private String procfsMemFile;
|
||||||
private String procfsCpuFile;
|
private String procfsCpuFile;
|
||||||
@ -87,12 +87,6 @@ public class LinuxResourceCalculatorPlugin extends ResourceCalculatorPlugin {
|
|||||||
private long inactiveSize = 0; // inactive cache memory (kB)
|
private long inactiveSize = 0; // inactive cache memory (kB)
|
||||||
private int numProcessors = 0; // number of processors on the system
|
private int numProcessors = 0; // number of processors on the system
|
||||||
private long cpuFrequency = 0L; // CPU frequency on the system (kHz)
|
private long cpuFrequency = 0L; // CPU frequency on the system (kHz)
|
||||||
private long cumulativeCpuTime = 0L; // CPU used time since system is on (ms)
|
|
||||||
private long lastCumulativeCpuTime = 0L; // CPU used time read last time (ms)
|
|
||||||
// Unix timestamp while reading the CPU time (ms)
|
|
||||||
private float cpuUsage = UNAVAILABLE;
|
|
||||||
private long sampleTime = UNAVAILABLE;
|
|
||||||
private long lastSampleTime = UNAVAILABLE;
|
|
||||||
|
|
||||||
boolean readMemInfoFile = false;
|
boolean readMemInfoFile = false;
|
||||||
boolean readCpuInfoFile = false;
|
boolean readCpuInfoFile = false;
|
||||||
@ -106,10 +100,8 @@ long getCurrentTime() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public LinuxResourceCalculatorPlugin() {
|
public LinuxResourceCalculatorPlugin() {
|
||||||
procfsMemFile = PROCFS_MEMFILE;
|
this(PROCFS_MEMFILE, PROCFS_CPUINFO, PROCFS_STAT,
|
||||||
procfsCpuFile = PROCFS_CPUINFO;
|
ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS);
|
||||||
procfsStatFile = PROCFS_STAT;
|
|
||||||
jiffyLengthInMillis = ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -128,6 +120,7 @@ public LinuxResourceCalculatorPlugin(String procfsMemFile,
|
|||||||
this.procfsCpuFile = procfsCpuFile;
|
this.procfsCpuFile = procfsCpuFile;
|
||||||
this.procfsStatFile = procfsStatFile;
|
this.procfsStatFile = procfsStatFile;
|
||||||
this.jiffyLengthInMillis = jiffyLengthInMillis;
|
this.jiffyLengthInMillis = jiffyLengthInMillis;
|
||||||
|
this.cpuTimeTracker = new CpuTimeTracker(jiffyLengthInMillis);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -276,12 +269,13 @@ private void readProcStatFile() {
|
|||||||
long uTime = Long.parseLong(mat.group(1));
|
long uTime = Long.parseLong(mat.group(1));
|
||||||
long nTime = Long.parseLong(mat.group(2));
|
long nTime = Long.parseLong(mat.group(2));
|
||||||
long sTime = Long.parseLong(mat.group(3));
|
long sTime = Long.parseLong(mat.group(3));
|
||||||
cumulativeCpuTime = uTime + nTime + sTime; // milliseconds
|
cpuTimeTracker.updateElapsedJiffies(
|
||||||
|
BigInteger.valueOf(uTime + nTime + sTime),
|
||||||
|
getCurrentTime());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
str = in.readLine();
|
str = in.readLine();
|
||||||
}
|
}
|
||||||
cumulativeCpuTime *= jiffyLengthInMillis;
|
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
LOG.warn("Error reading the stream " + io);
|
LOG.warn("Error reading the stream " + io);
|
||||||
} finally {
|
} finally {
|
||||||
@ -345,32 +339,18 @@ public long getCpuFrequency() {
|
|||||||
@Override
|
@Override
|
||||||
public long getCumulativeCpuTime() {
|
public long getCumulativeCpuTime() {
|
||||||
readProcStatFile();
|
readProcStatFile();
|
||||||
return cumulativeCpuTime;
|
return cpuTimeTracker.cumulativeCpuTime.longValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** {@inheritDoc} */
|
/** {@inheritDoc} */
|
||||||
@Override
|
@Override
|
||||||
public float getCpuUsage() {
|
public float getCpuUsage() {
|
||||||
readProcStatFile();
|
readProcStatFile();
|
||||||
sampleTime = getCurrentTime();
|
float overallCpuUsage = cpuTimeTracker.getCpuTrackerUsagePercent();
|
||||||
if (lastSampleTime == UNAVAILABLE ||
|
if (overallCpuUsage != CpuTimeTracker.UNAVAILABLE) {
|
||||||
lastSampleTime > sampleTime) {
|
overallCpuUsage = overallCpuUsage / getNumProcessors();
|
||||||
// lastSampleTime > sampleTime may happen when the system time is changed
|
|
||||||
lastSampleTime = sampleTime;
|
|
||||||
lastCumulativeCpuTime = cumulativeCpuTime;
|
|
||||||
return cpuUsage;
|
|
||||||
}
|
}
|
||||||
// When lastSampleTime is sufficiently old, update cpuUsage.
|
return overallCpuUsage;
|
||||||
// Also take a sample of the current time and cumulative CPU time for the
|
|
||||||
// use of the next calculation.
|
|
||||||
final long MINIMUM_UPDATE_INTERVAL = 10 * jiffyLengthInMillis;
|
|
||||||
if (sampleTime > lastSampleTime + MINIMUM_UPDATE_INTERVAL) {
|
|
||||||
cpuUsage = (float)(cumulativeCpuTime - lastCumulativeCpuTime) * 100F /
|
|
||||||
((float)(sampleTime - lastSampleTime) * getNumProcessors());
|
|
||||||
lastSampleTime = sampleTime;
|
|
||||||
lastCumulativeCpuTime = cumulativeCpuTime;
|
|
||||||
}
|
|
||||||
return cpuUsage;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -66,6 +66,8 @@ public class ProcfsBasedProcessTree extends ResourceCalculatorProcessTree {
|
|||||||
public static final String PROCFS_CMDLINE_FILE = "cmdline";
|
public static final String PROCFS_CMDLINE_FILE = "cmdline";
|
||||||
public static final long PAGE_SIZE;
|
public static final long PAGE_SIZE;
|
||||||
public static final long JIFFY_LENGTH_IN_MILLIS; // in millisecond
|
public static final long JIFFY_LENGTH_IN_MILLIS; // in millisecond
|
||||||
|
private final CpuTimeTracker cpuTimeTracker;
|
||||||
|
private Clock clock;
|
||||||
|
|
||||||
enum MemInfo {
|
enum MemInfo {
|
||||||
SIZE("Size"), RSS("Rss"), PSS("Pss"), SHARED_CLEAN("Shared_Clean"),
|
SIZE("Size"), RSS("Rss"), PSS("Pss"), SHARED_CLEAN("Shared_Clean"),
|
||||||
@ -144,7 +146,7 @@ public static MemInfo getMemInfoByName(String name) {
|
|||||||
new HashMap<String, ProcessInfo>();
|
new HashMap<String, ProcessInfo>();
|
||||||
|
|
||||||
public ProcfsBasedProcessTree(String pid) {
|
public ProcfsBasedProcessTree(String pid) {
|
||||||
this(pid, PROCFS);
|
this(pid, PROCFS, new SystemClock());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -157,6 +159,10 @@ public void setConf(Configuration conf) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ProcfsBasedProcessTree(String pid, String procfsDir) {
|
||||||
|
this(pid, procfsDir, new SystemClock());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build a new process tree rooted at the pid.
|
* Build a new process tree rooted at the pid.
|
||||||
*
|
*
|
||||||
@ -165,11 +171,14 @@ public void setConf(Configuration conf) {
|
|||||||
*
|
*
|
||||||
* @param pid root of the process tree
|
* @param pid root of the process tree
|
||||||
* @param procfsDir the root of a proc file system - only used for testing.
|
* @param procfsDir the root of a proc file system - only used for testing.
|
||||||
|
* @param clock clock for controlling time for testing
|
||||||
*/
|
*/
|
||||||
public ProcfsBasedProcessTree(String pid, String procfsDir) {
|
public ProcfsBasedProcessTree(String pid, String procfsDir, Clock clock) {
|
||||||
super(pid);
|
super(pid);
|
||||||
|
this.clock = clock;
|
||||||
this.pid = getValidPID(pid);
|
this.pid = getValidPID(pid);
|
||||||
this.procfsDir = procfsDir;
|
this.procfsDir = procfsDir;
|
||||||
|
this.cpuTimeTracker = new CpuTimeTracker(JIFFY_LENGTH_IN_MILLIS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -447,6 +456,26 @@ public long getCumulativeCpuTime() {
|
|||||||
return cpuTime;
|
return cpuTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private BigInteger getTotalProcessJiffies() {
|
||||||
|
BigInteger totalStime = BigInteger.ZERO;
|
||||||
|
long totalUtime = 0;
|
||||||
|
for (ProcessInfo p : processTree.values()) {
|
||||||
|
if (p != null) {
|
||||||
|
totalUtime += p.getUtime();
|
||||||
|
totalStime = totalStime.add(p.getStime());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return totalStime.add(BigInteger.valueOf(totalUtime));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float getCpuUsagePercent() {
|
||||||
|
BigInteger processTotalJiffies = getTotalProcessJiffies();
|
||||||
|
cpuTimeTracker.updateElapsedJiffies(processTotalJiffies,
|
||||||
|
clock.getTime());
|
||||||
|
return cpuTimeTracker.getCpuTrackerUsagePercent();
|
||||||
|
}
|
||||||
|
|
||||||
private static String getValidPID(String pid) {
|
private static String getValidPID(String pid) {
|
||||||
if (pid == null) return deadPid;
|
if (pid == null) return deadPid;
|
||||||
Matcher m = numberPattern.matcher(pid);
|
Matcher m = numberPattern.matcher(pid);
|
||||||
@ -962,4 +991,48 @@ public String toString() {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the {@link ProcfsBasedProcessTree}
|
||||||
|
*
|
||||||
|
* @param args
|
||||||
|
*/
|
||||||
|
public static void main(String[] args) {
|
||||||
|
if (args.length != 1) {
|
||||||
|
System.out.println("Provide <pid of process to monitor>");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int numprocessors =
|
||||||
|
ResourceCalculatorPlugin.getResourceCalculatorPlugin(null, null)
|
||||||
|
.getNumProcessors();
|
||||||
|
System.out.println("Number of processors " + numprocessors);
|
||||||
|
|
||||||
|
System.out.println("Creating ProcfsBasedProcessTree for process " +
|
||||||
|
args[0]);
|
||||||
|
ProcfsBasedProcessTree procfsBasedProcessTree = new
|
||||||
|
ProcfsBasedProcessTree(args[0]);
|
||||||
|
procfsBasedProcessTree.updateProcessTree();
|
||||||
|
|
||||||
|
System.out.println(procfsBasedProcessTree.getProcessTreeDump());
|
||||||
|
System.out.println("Get cpu usage " + procfsBasedProcessTree
|
||||||
|
.getCpuUsagePercent());
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Sleep so we can compute the CPU usage
|
||||||
|
Thread.sleep(500L);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
|
||||||
|
procfsBasedProcessTree.updateProcessTree();
|
||||||
|
|
||||||
|
System.out.println(procfsBasedProcessTree.getProcessTreeDump());
|
||||||
|
System.out.println("Cpu usage " + procfsBasedProcessTree
|
||||||
|
.getCpuUsagePercent());
|
||||||
|
System.out.println("Vmem usage in bytes " + procfsBasedProcessTree
|
||||||
|
.getCumulativeVmem());
|
||||||
|
System.out.println("Rss mem usage in bytes " + procfsBasedProcessTree
|
||||||
|
.getCumulativeRssmem());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -108,13 +108,23 @@ public long getCumulativeRssmem() {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the CPU time in millisecond used by all the processes in the
|
* Get the CPU time in millisecond used by all the processes in the
|
||||||
* process-tree since the process-tree created
|
* process-tree since the process-tree was created
|
||||||
*
|
*
|
||||||
* @return cumulative CPU time in millisecond since the process-tree created
|
* @return cumulative CPU time in millisecond since the process-tree created
|
||||||
* return 0 if it cannot be calculated
|
* return 0 if it cannot be calculated
|
||||||
*/
|
*/
|
||||||
public abstract long getCumulativeCpuTime();
|
public abstract long getCumulativeCpuTime();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the CPU usage by all the processes in the process-tree based on
|
||||||
|
* average between samples as a ratio of overall CPU cycles similar to top.
|
||||||
|
* Thus, if 2 out of 4 cores are used this should return 200.0.
|
||||||
|
*
|
||||||
|
* @return percentage CPU usage since the process-tree was created
|
||||||
|
* return {@link CpuTimeTracker#UNAVAILABLE} if it cannot be calculated
|
||||||
|
*/
|
||||||
|
public abstract float getCpuUsagePercent();
|
||||||
|
|
||||||
/** Verify that the tree process id is same as its process group id.
|
/** Verify that the tree process id is same as its process group id.
|
||||||
* @return true if the process id matches else return false.
|
* @return true if the process id matches else return false.
|
||||||
*/
|
*/
|
||||||
|
@ -202,4 +202,9 @@ public long getCumulativeCpuTime() {
|
|||||||
return cpuTimeMs;
|
return cpuTimeMs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float getCpuUsagePercent() {
|
||||||
|
return CpuTimeTracker.UNAVAILABLE;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -171,7 +171,7 @@ public void parsingProcStatAndCpuFile() throws IOException {
|
|||||||
updateStatFile(uTime, nTime, sTime);
|
updateStatFile(uTime, nTime, sTime);
|
||||||
assertEquals(plugin.getCumulativeCpuTime(),
|
assertEquals(plugin.getCumulativeCpuTime(),
|
||||||
FAKE_JIFFY_LENGTH * (uTime + nTime + sTime));
|
FAKE_JIFFY_LENGTH * (uTime + nTime + sTime));
|
||||||
assertEquals(plugin.getCpuUsage(), (float)(LinuxResourceCalculatorPlugin.UNAVAILABLE),0.0);
|
assertEquals(plugin.getCpuUsage(), (float)(CpuTimeTracker.UNAVAILABLE),0.0);
|
||||||
|
|
||||||
// Advance the time and sample again to test the CPU usage calculation
|
// Advance the time and sample again to test the CPU usage calculation
|
||||||
uTime += 100L;
|
uTime += 100L;
|
||||||
|
@ -236,8 +236,8 @@ protected ProcfsBasedProcessTree createProcessTree(String pid) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected ProcfsBasedProcessTree createProcessTree(String pid,
|
protected ProcfsBasedProcessTree createProcessTree(String pid,
|
||||||
String procfsRootDir) {
|
String procfsRootDir, Clock clock) {
|
||||||
return new ProcfsBasedProcessTree(pid, procfsRootDir);
|
return new ProcfsBasedProcessTree(pid, procfsRootDir, clock);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void destroyProcessTree(String pid) throws IOException {
|
protected void destroyProcessTree(String pid) throws IOException {
|
||||||
@ -388,6 +388,8 @@ public void testCpuAndMemoryForProcessTree() throws IOException {
|
|||||||
|
|
||||||
// test processes
|
// test processes
|
||||||
String[] pids = { "100", "200", "300", "400" };
|
String[] pids = { "100", "200", "300", "400" };
|
||||||
|
ControlledClock testClock = new ControlledClock(new SystemClock());
|
||||||
|
testClock.setTime(0);
|
||||||
// create the fake procfs root directory.
|
// create the fake procfs root directory.
|
||||||
File procfsRootDir = new File(TEST_ROOT_DIR, "proc");
|
File procfsRootDir = new File(TEST_ROOT_DIR, "proc");
|
||||||
|
|
||||||
@ -422,7 +424,7 @@ public void testCpuAndMemoryForProcessTree() throws IOException {
|
|||||||
// crank up the process tree class.
|
// crank up the process tree class.
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
ProcfsBasedProcessTree processTree =
|
ProcfsBasedProcessTree processTree =
|
||||||
createProcessTree("100", procfsRootDir.getAbsolutePath());
|
createProcessTree("100", procfsRootDir.getAbsolutePath(), testClock);
|
||||||
processTree.setConf(conf);
|
processTree.setConf(conf);
|
||||||
// build the process tree.
|
// build the process tree.
|
||||||
processTree.updateProcessTree();
|
processTree.updateProcessTree();
|
||||||
@ -444,6 +446,12 @@ public void testCpuAndMemoryForProcessTree() throws IOException {
|
|||||||
? 7200L * ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS : 0L;
|
? 7200L * ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS : 0L;
|
||||||
Assert.assertEquals("Cumulative cpu time does not match", cumuCpuTime,
|
Assert.assertEquals("Cumulative cpu time does not match", cumuCpuTime,
|
||||||
processTree.getCumulativeCpuTime());
|
processTree.getCumulativeCpuTime());
|
||||||
|
|
||||||
|
// verify CPU usage
|
||||||
|
Assert.assertEquals("Percent CPU time should be set to -1 initially",
|
||||||
|
-1.0, processTree.getCpuUsagePercent(),
|
||||||
|
0.01);
|
||||||
|
|
||||||
// Check by enabling smaps
|
// Check by enabling smaps
|
||||||
setSmapsInProceTree(processTree, true);
|
setSmapsInProceTree(processTree, true);
|
||||||
// RSS=Min(shared_dirty,PSS)+PrivateClean+PrivateDirty (exclude r-xs,
|
// RSS=Min(shared_dirty,PSS)+PrivateClean+PrivateDirty (exclude r-xs,
|
||||||
@ -460,15 +468,31 @@ public void testCpuAndMemoryForProcessTree() throws IOException {
|
|||||||
"100", "200000", "200", "3000", "500" });
|
"100", "200000", "200", "3000", "500" });
|
||||||
writeStatFiles(procfsRootDir, pids, procInfos, memInfo);
|
writeStatFiles(procfsRootDir, pids, procInfos, memInfo);
|
||||||
|
|
||||||
|
long elapsedTimeBetweenUpdatesMsec = 200000;
|
||||||
|
testClock.setTime(elapsedTimeBetweenUpdatesMsec);
|
||||||
// build the process tree.
|
// build the process tree.
|
||||||
processTree.updateProcessTree();
|
processTree.updateProcessTree();
|
||||||
|
|
||||||
// verify cumulative cpu time again
|
// verify cumulative cpu time again
|
||||||
|
long prevCumuCpuTime = cumuCpuTime;
|
||||||
cumuCpuTime =
|
cumuCpuTime =
|
||||||
ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS > 0
|
ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS > 0
|
||||||
? 9400L * ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS : 0L;
|
? 9400L * ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS : 0L;
|
||||||
Assert.assertEquals("Cumulative cpu time does not match", cumuCpuTime,
|
Assert.assertEquals("Cumulative cpu time does not match", cumuCpuTime,
|
||||||
processTree.getCumulativeCpuTime());
|
processTree.getCumulativeCpuTime());
|
||||||
|
|
||||||
|
double expectedCpuUsagePercent =
|
||||||
|
(ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS > 0) ?
|
||||||
|
(cumuCpuTime - prevCumuCpuTime) * 100.0 /
|
||||||
|
elapsedTimeBetweenUpdatesMsec : 0;
|
||||||
|
// expectedCpuUsagePercent is given by (94000L - 72000) * 100/
|
||||||
|
// 200000;
|
||||||
|
// which in this case is 11. Lets verify that first
|
||||||
|
Assert.assertEquals(11, expectedCpuUsagePercent, 0.001);
|
||||||
|
Assert.assertEquals("Percent CPU time is not correct expected " +
|
||||||
|
expectedCpuUsagePercent, expectedCpuUsagePercent,
|
||||||
|
processTree.getCpuUsagePercent(),
|
||||||
|
0.01);
|
||||||
} finally {
|
} finally {
|
||||||
FileUtil.fullyDelete(procfsRootDir);
|
FileUtil.fullyDelete(procfsRootDir);
|
||||||
}
|
}
|
||||||
@ -535,7 +559,8 @@ private void testMemForOlderProcesses(boolean smapEnabled) throws IOException {
|
|||||||
|
|
||||||
// crank up the process tree class.
|
// crank up the process tree class.
|
||||||
ProcfsBasedProcessTree processTree =
|
ProcfsBasedProcessTree processTree =
|
||||||
createProcessTree("100", procfsRootDir.getAbsolutePath());
|
createProcessTree("100", procfsRootDir.getAbsolutePath(),
|
||||||
|
new SystemClock());
|
||||||
setSmapsInProceTree(processTree, smapEnabled);
|
setSmapsInProceTree(processTree, smapEnabled);
|
||||||
|
|
||||||
// verify cumulative memory
|
// verify cumulative memory
|
||||||
@ -672,7 +697,7 @@ public void testDestroyProcessTree() throws IOException {
|
|||||||
setupProcfsRootDir(procfsRootDir);
|
setupProcfsRootDir(procfsRootDir);
|
||||||
|
|
||||||
// crank up the process tree class.
|
// crank up the process tree class.
|
||||||
createProcessTree(pid, procfsRootDir.getAbsolutePath());
|
createProcessTree(pid, procfsRootDir.getAbsolutePath(), new SystemClock());
|
||||||
|
|
||||||
// Let us not create stat file for pid 100.
|
// Let us not create stat file for pid 100.
|
||||||
Assert.assertTrue(ProcfsBasedProcessTree.checkPidPgrpidForMatch(pid,
|
Assert.assertTrue(ProcfsBasedProcessTree.checkPidPgrpidForMatch(pid,
|
||||||
@ -741,7 +766,8 @@ public void testProcessTreeDump() throws IOException {
|
|||||||
writeCmdLineFiles(procfsRootDir, pids, cmdLines);
|
writeCmdLineFiles(procfsRootDir, pids, cmdLines);
|
||||||
|
|
||||||
ProcfsBasedProcessTree processTree =
|
ProcfsBasedProcessTree processTree =
|
||||||
createProcessTree("100", procfsRootDir.getAbsolutePath());
|
createProcessTree("100", procfsRootDir.getAbsolutePath(),
|
||||||
|
new SystemClock());
|
||||||
// build the process tree.
|
// build the process tree.
|
||||||
processTree.updateProcessTree();
|
processTree.updateProcessTree();
|
||||||
|
|
||||||
|
@ -53,6 +53,11 @@ public long getCumulativeCpuTime() {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float getCpuUsagePercent() {
|
||||||
|
return CpuTimeTracker.UNAVAILABLE;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean checkPidPgrpidForMatch() {
|
public boolean checkPidPgrpidForMatch() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -42,14 +42,29 @@
|
|||||||
@Metrics(context="container")
|
@Metrics(context="container")
|
||||||
public class ContainerMetrics implements MetricsSource {
|
public class ContainerMetrics implements MetricsSource {
|
||||||
|
|
||||||
public static final String PMEM_LIMIT_METRIC_NAME = "pMemLimit";
|
public static final String PMEM_LIMIT_METRIC_NAME = "pMemLimitMBs";
|
||||||
public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimit";
|
public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimitMBs";
|
||||||
public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit";
|
public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit";
|
||||||
public static final String PMEM_USAGE_METRIC_NAME = "pMemUsage";
|
public static final String PMEM_USAGE_METRIC_NAME = "pMemUsageMBs";
|
||||||
|
private static final String PHY_CPU_USAGE_METRIC_NAME = "pCpuUsagePercent";
|
||||||
|
|
||||||
|
// Use a multiplier of 1000 to avoid losing too much precision when
|
||||||
|
// converting to integers
|
||||||
|
private static final String VCORE_USAGE_METRIC_NAME = "milliVcoreUsage";
|
||||||
|
|
||||||
@Metric
|
@Metric
|
||||||
public MutableStat pMemMBsStat;
|
public MutableStat pMemMBsStat;
|
||||||
|
|
||||||
|
// This tracks overall CPU percentage of the machine in terms of percentage
|
||||||
|
// of 1 core similar to top
|
||||||
|
// Thus if you use 2 cores completely out of 4 available cores this value
|
||||||
|
// will be 200
|
||||||
|
@Metric
|
||||||
|
public MutableStat cpuCoreUsagePercent;
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public MutableStat milliVcoresUsed;
|
||||||
|
|
||||||
@Metric
|
@Metric
|
||||||
public MutableGaugeInt pMemLimitMbs;
|
public MutableGaugeInt pMemLimitMbs;
|
||||||
|
|
||||||
@ -57,7 +72,7 @@ public class ContainerMetrics implements MetricsSource {
|
|||||||
public MutableGaugeInt vMemLimitMbs;
|
public MutableGaugeInt vMemLimitMbs;
|
||||||
|
|
||||||
@Metric
|
@Metric
|
||||||
public MutableGaugeInt cpuVcores;
|
public MutableGaugeInt cpuVcoreLimit;
|
||||||
|
|
||||||
static final MetricsInfo RECORD_INFO =
|
static final MetricsInfo RECORD_INFO =
|
||||||
info("ContainerResource", "Resource limit and usage by container");
|
info("ContainerResource", "Resource limit and usage by container");
|
||||||
@ -95,11 +110,17 @@ public class ContainerMetrics implements MetricsSource {
|
|||||||
|
|
||||||
this.pMemMBsStat = registry.newStat(
|
this.pMemMBsStat = registry.newStat(
|
||||||
PMEM_USAGE_METRIC_NAME, "Physical memory stats", "Usage", "MBs", true);
|
PMEM_USAGE_METRIC_NAME, "Physical memory stats", "Usage", "MBs", true);
|
||||||
|
this.cpuCoreUsagePercent = registry.newStat(
|
||||||
|
PHY_CPU_USAGE_METRIC_NAME, "Physical Cpu core percent usage stats",
|
||||||
|
"Usage", "Percents", true);
|
||||||
|
this.milliVcoresUsed = registry.newStat(
|
||||||
|
VCORE_USAGE_METRIC_NAME, "1000 times Vcore usage", "Usage",
|
||||||
|
"MilliVcores", true);
|
||||||
this.pMemLimitMbs = registry.newGauge(
|
this.pMemLimitMbs = registry.newGauge(
|
||||||
PMEM_LIMIT_METRIC_NAME, "Physical memory limit in MBs", 0);
|
PMEM_LIMIT_METRIC_NAME, "Physical memory limit in MBs", 0);
|
||||||
this.vMemLimitMbs = registry.newGauge(
|
this.vMemLimitMbs = registry.newGauge(
|
||||||
VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0);
|
VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0);
|
||||||
this.cpuVcores = registry.newGauge(
|
this.cpuVcoreLimit = registry.newGauge(
|
||||||
VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0);
|
VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -170,6 +191,12 @@ public void recordMemoryUsage(int memoryMBs) {
|
|||||||
this.pMemMBsStat.add(memoryMBs);
|
this.pMemMBsStat.add(memoryMBs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void recordCpuUsage(
|
||||||
|
int totalPhysicalCpuPercent, int milliVcoresUsed) {
|
||||||
|
this.cpuCoreUsagePercent.add(totalPhysicalCpuPercent);
|
||||||
|
this.milliVcoresUsed.add(milliVcoresUsed);
|
||||||
|
}
|
||||||
|
|
||||||
public void recordProcessId(String processId) {
|
public void recordProcessId(String processId) {
|
||||||
registry.tag(PROCESSID_INFO, processId);
|
registry.tag(PROCESSID_INFO, processId);
|
||||||
}
|
}
|
||||||
@ -177,7 +204,7 @@ public void recordProcessId(String processId) {
|
|||||||
public void recordResourceLimit(int vmemLimit, int pmemLimit, int cpuVcores) {
|
public void recordResourceLimit(int vmemLimit, int pmemLimit, int cpuVcores) {
|
||||||
this.vMemLimitMbs.set(vmemLimit);
|
this.vMemLimitMbs.set(vmemLimit);
|
||||||
this.pMemLimitMbs.set(pmemLimit);
|
this.pMemLimitMbs.set(pmemLimit);
|
||||||
this.cpuVcores.set(cpuVcores);
|
this.cpuVcoreLimit.set(cpuVcores);
|
||||||
}
|
}
|
||||||
|
|
||||||
private synchronized void scheduleTimerTaskIfRequired() {
|
private synchronized void scheduleTimerTaskIfRequired() {
|
||||||
|
@ -38,6 +38,7 @@
|
|||||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
||||||
|
|
||||||
@ -75,6 +76,7 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|||||||
private long maxVCoresAllottedForContainers;
|
private long maxVCoresAllottedForContainers;
|
||||||
|
|
||||||
private static final long UNKNOWN_MEMORY_LIMIT = -1L;
|
private static final long UNKNOWN_MEMORY_LIMIT = -1L;
|
||||||
|
private int nodeCpuPercentageForYARN;
|
||||||
|
|
||||||
public ContainersMonitorImpl(ContainerExecutor exec,
|
public ContainersMonitorImpl(ContainerExecutor exec,
|
||||||
AsyncDispatcher dispatcher, Context context) {
|
AsyncDispatcher dispatcher, Context context) {
|
||||||
@ -145,6 +147,9 @@ protected void serviceInit(Configuration conf) throws Exception {
|
|||||||
LOG.info("Physical memory check enabled: " + pmemCheckEnabled);
|
LOG.info("Physical memory check enabled: " + pmemCheckEnabled);
|
||||||
LOG.info("Virtual memory check enabled: " + vmemCheckEnabled);
|
LOG.info("Virtual memory check enabled: " + vmemCheckEnabled);
|
||||||
|
|
||||||
|
nodeCpuPercentageForYARN =
|
||||||
|
NodeManagerHardwareUtils.getNodeCpuPercentage(conf);
|
||||||
|
|
||||||
if (pmemCheckEnabled) {
|
if (pmemCheckEnabled) {
|
||||||
// Logging if actual pmem cannot be determined.
|
// Logging if actual pmem cannot be determined.
|
||||||
long totalPhysicalMemoryOnNM = UNKNOWN_MEMORY_LIMIT;
|
long totalPhysicalMemoryOnNM = UNKNOWN_MEMORY_LIMIT;
|
||||||
@ -434,6 +439,16 @@ public void run() {
|
|||||||
pTree.updateProcessTree(); // update process-tree
|
pTree.updateProcessTree(); // update process-tree
|
||||||
long currentVmemUsage = pTree.getCumulativeVmem();
|
long currentVmemUsage = pTree.getCumulativeVmem();
|
||||||
long currentPmemUsage = pTree.getCumulativeRssmem();
|
long currentPmemUsage = pTree.getCumulativeRssmem();
|
||||||
|
// if machine has 6 cores and 3 are used,
|
||||||
|
// cpuUsagePercentPerCore should be 300% and
|
||||||
|
// cpuUsageTotalCoresPercentage should be 50%
|
||||||
|
float cpuUsagePercentPerCore = pTree.getCpuUsagePercent();
|
||||||
|
float cpuUsageTotalCoresPercentage = cpuUsagePercentPerCore /
|
||||||
|
resourceCalculatorPlugin.getNumProcessors();
|
||||||
|
|
||||||
|
// Multiply by 1000 to avoid losing data when converting to int
|
||||||
|
int milliVcoresUsed = (int) (cpuUsageTotalCoresPercentage * 1000
|
||||||
|
* maxVCoresAllottedForContainers /nodeCpuPercentageForYARN);
|
||||||
// as processes begin with an age 1, we want to see if there
|
// as processes begin with an age 1, we want to see if there
|
||||||
// are processes more than 1 iteration old.
|
// are processes more than 1 iteration old.
|
||||||
long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1);
|
long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1);
|
||||||
@ -451,6 +466,9 @@ public void run() {
|
|||||||
ContainerMetrics.forContainer(
|
ContainerMetrics.forContainer(
|
||||||
containerId, containerMetricsPeriodMs).recordMemoryUsage(
|
containerId, containerMetricsPeriodMs).recordMemoryUsage(
|
||||||
(int) (currentPmemUsage >> 20));
|
(int) (currentPmemUsage >> 20));
|
||||||
|
ContainerMetrics.forContainer(
|
||||||
|
containerId, containerMetricsPeriodMs).recordCpuUsage
|
||||||
|
((int)cpuUsagePercentPerCore, milliVcoresUsed);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isMemoryOverLimit = false;
|
boolean isMemoryOverLimit = false;
|
||||||
|
@ -59,6 +59,19 @@ public static float getContainersCores(Configuration conf) {
|
|||||||
public static float getContainersCores(ResourceCalculatorPlugin plugin,
|
public static float getContainersCores(ResourceCalculatorPlugin plugin,
|
||||||
Configuration conf) {
|
Configuration conf) {
|
||||||
int numProcessors = plugin.getNumProcessors();
|
int numProcessors = plugin.getNumProcessors();
|
||||||
|
int nodeCpuPercentage = getNodeCpuPercentage(conf);
|
||||||
|
|
||||||
|
return (nodeCpuPercentage * numProcessors) / 100.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the percentage of physical CPU that is configured for YARN containers
|
||||||
|
* This is percent > 0 and <= 100 based on
|
||||||
|
* YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT
|
||||||
|
* @param conf Configuration object
|
||||||
|
* @return percent > 0 and <= 100
|
||||||
|
*/
|
||||||
|
public static int getNodeCpuPercentage(Configuration conf) {
|
||||||
int nodeCpuPercentage =
|
int nodeCpuPercentage =
|
||||||
Math.min(conf.getInt(
|
Math.min(conf.getInt(
|
||||||
YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT,
|
YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT,
|
||||||
@ -73,7 +86,6 @@ public static float getContainersCores(ResourceCalculatorPlugin plugin,
|
|||||||
+ ". Value cannot be less than or equal to 0.";
|
+ ". Value cannot be less than or equal to 0.";
|
||||||
throw new IllegalArgumentException(message);
|
throw new IllegalArgumentException(message);
|
||||||
}
|
}
|
||||||
|
return nodeCpuPercentage;
|
||||||
return (nodeCpuPercentage * numProcessors) / 100.0f;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user