HDFS-13946. Log longest FSN write/read lock held stack trace.
This commit is contained in:
parent
763e96ea2b
commit
feb2664ac4
@ -272,6 +272,22 @@ public LogAction record(String recorderName, long currentTimeMs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the summary information for given index.
|
||||||
|
*
|
||||||
|
* @param recorderName The name of the recorder.
|
||||||
|
* @param idx The index value.
|
||||||
|
* @return The summary information.
|
||||||
|
*/
|
||||||
|
public SummaryStatistics getCurrentStats(String recorderName, int idx) {
|
||||||
|
LoggingAction currentLog = currentLogs.get(recorderName);
|
||||||
|
if (currentLog != null) {
|
||||||
|
return currentLog.getStats(idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A standard log action which keeps track of all of the values which have
|
* A standard log action which keeps track of all of the values which have
|
||||||
* been logged. This is also used for internal bookkeeping via its private
|
* been logged. This is also used for internal bookkeeping via its private
|
||||||
|
@ -167,6 +167,9 @@ public void testMultipleLoggersWithValues() {
|
|||||||
assertEquals(2.0, bar.getStats(0).getMean(), 0.01);
|
assertEquals(2.0, bar.getStats(0).getMean(), 0.01);
|
||||||
assertEquals(3.0, baz.getStats(0).getMean(), 0.01);
|
assertEquals(3.0, baz.getStats(0).getMean(), 0.01);
|
||||||
assertEquals(3.0, baz.getStats(1).getMean(), 0.01);
|
assertEquals(3.0, baz.getStats(1).getMean(), 0.01);
|
||||||
|
|
||||||
|
assertEquals(2.0, helper.getCurrentStats("bar", 0).getMax(), 0);
|
||||||
|
assertEquals(3.0, helper.getCurrentStats("baz", 0).getMax(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -21,10 +21,12 @@
|
|||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
import java.util.concurrent.locks.Condition;
|
import java.util.concurrent.locks.Condition;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.log.LogThrottlingHelper;
|
import org.apache.hadoop.log.LogThrottlingHelper;
|
||||||
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
|
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
|
||||||
@ -97,8 +99,18 @@ public Long initialValue() {
|
|||||||
new AtomicInteger(0);
|
new AtomicInteger(0);
|
||||||
/** Time stamp (ms) of the last time a read lock report was written. */
|
/** Time stamp (ms) of the last time a read lock report was written. */
|
||||||
private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0);
|
private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0);
|
||||||
/** Longest time (ms) a read lock was held since the last report. */
|
/**
|
||||||
private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0);
|
* The lock held time and stack trace for the longest time (ms) a read
|
||||||
|
* lock was held since the last report.
|
||||||
|
*/
|
||||||
|
private final AtomicReference<ReadLockHeldInfo> longestReadLockHeldInfo =
|
||||||
|
new AtomicReference<>(new ReadLockHeldInfo(0, null));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The stack trace for the longest time the write lock
|
||||||
|
* was held since the last report.
|
||||||
|
*/
|
||||||
|
private volatile String longestWriteLockHeldStackTrace;
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
static final String OP_NAME_OTHER = "OTHER";
|
static final String OP_NAME_OTHER = "OTHER";
|
||||||
@ -172,12 +184,13 @@ public void readUnlock(String opName) {
|
|||||||
final long readLockIntervalMs =
|
final long readLockIntervalMs =
|
||||||
TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
|
TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
|
||||||
if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
|
if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
|
||||||
long localLongestReadLock;
|
ReadLockHeldInfo localLockHeldInfo;
|
||||||
do {
|
do {
|
||||||
localLongestReadLock = longestReadLockHeldIntervalMs.get();
|
localLockHeldInfo = longestReadLockHeldInfo.get();
|
||||||
} while (localLongestReadLock - readLockIntervalMs < 0 &&
|
} while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 &&
|
||||||
!longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock,
|
!longestReadLockHeldInfo.compareAndSet(localLockHeldInfo,
|
||||||
readLockIntervalMs));
|
new ReadLockHeldInfo(readLockIntervalMs,
|
||||||
|
StringUtils.getStackTrace(Thread.currentThread()))));
|
||||||
|
|
||||||
long localTimeStampOfLastReadLockReport;
|
long localTimeStampOfLastReadLockReport;
|
||||||
long nowMs;
|
long nowMs;
|
||||||
@ -193,13 +206,13 @@ public void readUnlock(String opName) {
|
|||||||
} while (!timeStampOfLastReadLockReportMs.compareAndSet(
|
} while (!timeStampOfLastReadLockReportMs.compareAndSet(
|
||||||
localTimeStampOfLastReadLockReport, nowMs));
|
localTimeStampOfLastReadLockReport, nowMs));
|
||||||
int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
|
int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
|
||||||
long longestLockHeldIntervalMs =
|
ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo
|
||||||
longestReadLockHeldIntervalMs.getAndSet(0);
|
.getAndSet(new ReadLockHeldInfo(0, null));
|
||||||
FSNamesystem.LOG.info("FSNamesystem read lock held for " +
|
FSNamesystem.LOG.info(
|
||||||
readLockIntervalMs + " ms via\n" +
|
"\tNumber of suppressed read-lock reports: {}" +
|
||||||
StringUtils.getStackTrace(Thread.currentThread()) +
|
"\n\tLongest read-lock held interval: {}ms via {}",
|
||||||
"\tNumber of suppressed read-lock reports: " + numSuppressedWarnings +
|
numSuppressedWarnings, lockHeldInfo.getIntervalMs(),
|
||||||
"\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs);
|
lockHeldInfo.getStackTrace());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -255,6 +268,14 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
|
|||||||
LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
|
LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
|
||||||
if (needReport &&
|
if (needReport &&
|
||||||
writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
|
writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
|
||||||
|
SummaryStatistics currentStats =
|
||||||
|
writeLockReportLogger.getCurrentStats("write", 0);
|
||||||
|
double currentMaxTime = currentStats != null ? currentStats.getMax() : 0;
|
||||||
|
if (currentMaxTime < writeLockIntervalMs) {
|
||||||
|
longestWriteLockHeldStackTrace =
|
||||||
|
StringUtils.getStackTrace(Thread.currentThread());
|
||||||
|
}
|
||||||
|
|
||||||
logAction = writeLockReportLogger
|
logAction = writeLockReportLogger
|
||||||
.record("write", currentTimeMs, writeLockIntervalMs);
|
.record("write", currentTimeMs, writeLockIntervalMs);
|
||||||
}
|
}
|
||||||
@ -266,12 +287,12 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (logAction.shouldLog()) {
|
if (logAction.shouldLog()) {
|
||||||
FSNamesystem.LOG.info("FSNamesystem write lock held for {} ms via {}\t" +
|
FSNamesystem.LOG.info(
|
||||||
"Number of suppressed write-lock reports: {}\n\tLongest write-lock " +
|
"\tNumber of suppressed write-lock reports: {}" +
|
||||||
"held interval: {} \n\tTotal suppressed write-lock held time: {}",
|
"\n\tLongest write-lock held interval: {}ms via {} " +
|
||||||
writeLockIntervalMs,
|
"\n\tTotal suppressed write-lock held time: {}",
|
||||||
StringUtils.getStackTrace(Thread.currentThread()),
|
|
||||||
logAction.getCount() - 1, logAction.getStats(0).getMax(),
|
logAction.getCount() - 1, logAction.getStats(0).getMax(),
|
||||||
|
longestWriteLockHeldStackTrace,
|
||||||
logAction.getStats(0).getSum() - writeLockIntervalMs);
|
logAction.getStats(0).getSum() - writeLockIntervalMs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -323,4 +344,38 @@ private static String getMetricName(String operationName, boolean isWrite) {
|
|||||||
org.apache.commons.lang3.StringUtils.capitalize(operationName) +
|
org.apache.commons.lang3.StringUtils.capitalize(operationName) +
|
||||||
LOCK_METRIC_SUFFIX;
|
LOCK_METRIC_SUFFIX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read lock Held Info.
|
||||||
|
*/
|
||||||
|
private static class ReadLockHeldInfo {
|
||||||
|
/** Read lock held time. */
|
||||||
|
private Long intervalMs;
|
||||||
|
/** The stack trace read lock was held. */
|
||||||
|
private String stackTrace;
|
||||||
|
|
||||||
|
ReadLockHeldInfo(long intervalMs, String stackTrace) {
|
||||||
|
this.intervalMs = intervalMs;
|
||||||
|
this.stackTrace = stackTrace;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getIntervalMs() {
|
||||||
|
return this.intervalMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStackTrace() {
|
||||||
|
return this.stackTrace;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return this.intervalMs.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj)
|
||||||
|
.getIntervalMs().compareTo(intervalMs) == 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -202,7 +202,11 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
|
|||||||
timer.advance(writeLockReportingThreshold + 100);
|
timer.advance(writeLockReportingThreshold + 100);
|
||||||
logs.clearOutput();
|
logs.clearOutput();
|
||||||
fsnLock.writeUnlock();
|
fsnLock.writeUnlock();
|
||||||
|
// look for the method name in the stack trace
|
||||||
assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
||||||
|
// find the held interval time in the log
|
||||||
|
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
|
||||||
|
assertTrue(pattern.matcher(logs.getOutput()).find());
|
||||||
assertTrue(logs.getOutput().contains(
|
assertTrue(logs.getOutput().contains(
|
||||||
"Number of suppressed write-lock reports: 2"));
|
"Number of suppressed write-lock reports: 2"));
|
||||||
}
|
}
|
||||||
@ -215,7 +219,7 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
|
|||||||
public void testFSReadLockLongHoldingReport() throws Exception {
|
public void testFSReadLockLongHoldingReport() throws Exception {
|
||||||
final long readLockReportingThreshold = 100L;
|
final long readLockReportingThreshold = 100L;
|
||||||
final long readLockSuppressWarningInterval = 10000L;
|
final long readLockSuppressWarningInterval = 10000L;
|
||||||
final String readLockLogStmt = "FSNamesystem read lock held for ";
|
final String readLockLogStmt = "Number of suppressed read-lock reports";
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
conf.setLong(
|
conf.setLong(
|
||||||
DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
|
DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
|
||||||
@ -256,6 +260,18 @@ public void testFSReadLockLongHoldingReport() throws Exception {
|
|||||||
// Track but do not Report if it's held for a long time when re-entering
|
// Track but do not Report if it's held for a long time when re-entering
|
||||||
// read lock but time since last report does not exceed the suppress
|
// read lock but time since last report does not exceed the suppress
|
||||||
// warning interval
|
// warning interval
|
||||||
|
Thread tLong = new Thread() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
fsnLock.readLock();
|
||||||
|
// Add one lock hold which is the longest, but occurs under a different
|
||||||
|
// stack trace, to ensure this is the one that gets logged
|
||||||
|
timer.advance(readLockReportingThreshold + 20);
|
||||||
|
fsnLock.readUnlock();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
tLong.start();
|
||||||
|
tLong.join();
|
||||||
fsnLock.readLock();
|
fsnLock.readLock();
|
||||||
timer.advance(readLockReportingThreshold / 2 + 1);
|
timer.advance(readLockReportingThreshold / 2 + 1);
|
||||||
fsnLock.readLock();
|
fsnLock.readLock();
|
||||||
@ -268,6 +284,18 @@ public void testFSReadLockLongHoldingReport() throws Exception {
|
|||||||
fsnLock.readUnlock();
|
fsnLock.readUnlock();
|
||||||
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
|
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
|
||||||
logs.getOutput().contains(readLockLogStmt));
|
logs.getOutput().contains(readLockLogStmt));
|
||||||
|
timer.advance(readLockSuppressWarningInterval);
|
||||||
|
fsnLock.readLock();
|
||||||
|
timer.advance(readLockReportingThreshold + 1);
|
||||||
|
fsnLock.readUnlock();
|
||||||
|
// Assert that stack trace eventually logged is the one for the longest hold
|
||||||
|
String stackTracePatternString =
|
||||||
|
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
|
||||||
|
Pattern tLongPattern = Pattern.compile(
|
||||||
|
String.format(stackTracePatternString, tLong.getClass().getName()));
|
||||||
|
assertTrue(tLongPattern.matcher(logs.getOutput()).find());
|
||||||
|
assertTrue(logs.getOutput().contains(
|
||||||
|
"Number of suppressed read-lock reports: 3"));
|
||||||
|
|
||||||
// Report if it's held for a long time (and time since last report
|
// Report if it's held for a long time (and time since last report
|
||||||
// exceeds the suppress warning interval) while another thread also has the
|
// exceeds the suppress warning interval) while another thread also has the
|
||||||
@ -310,16 +338,15 @@ public void run() {
|
|||||||
t1.join();
|
t1.join();
|
||||||
t2.join();
|
t2.join();
|
||||||
// Look for the differentiating class names in the stack trace
|
// Look for the differentiating class names in the stack trace
|
||||||
String stackTracePatternString =
|
|
||||||
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
|
|
||||||
Pattern t1Pattern = Pattern.compile(
|
Pattern t1Pattern = Pattern.compile(
|
||||||
String.format(stackTracePatternString, t1.getClass().getName()));
|
String.format(stackTracePatternString, t1.getClass().getName()));
|
||||||
assertTrue(t1Pattern.matcher(logs.getOutput()).find());
|
assertTrue(t1Pattern.matcher(logs.getOutput()).find());
|
||||||
Pattern t2Pattern = Pattern.compile(
|
Pattern t2Pattern = Pattern.compile(
|
||||||
String.format(stackTracePatternString, t2.getClass().getName()));
|
String.format(stackTracePatternString, t2.getClass().getName()));
|
||||||
assertFalse(t2Pattern.matcher(logs.getOutput()).find());
|
assertFalse(t2Pattern.matcher(logs.getOutput()).find());
|
||||||
assertTrue(logs.getOutput().contains(
|
// match the held interval time in the log
|
||||||
"Number of suppressed read-lock reports: 2"));
|
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
|
||||||
|
assertTrue(pattern.matcher(logs.getOutput()).find());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -396,7 +423,7 @@ public void testFSWriteLockReportSuppressed() throws Exception {
|
|||||||
timer.advance(writeLockReportingThreshold + 100);
|
timer.advance(writeLockReportingThreshold + 100);
|
||||||
fsnLock.writeUnlock();
|
fsnLock.writeUnlock();
|
||||||
assertTrue(logs.getOutput().contains(
|
assertTrue(logs.getOutput().contains(
|
||||||
"FSNamesystem write lock held for"));
|
"Number of suppressed write-lock reports"));
|
||||||
|
|
||||||
logs.clearOutput();
|
logs.clearOutput();
|
||||||
|
|
||||||
@ -407,8 +434,6 @@ public void testFSWriteLockReportSuppressed() throws Exception {
|
|||||||
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
||||||
assertFalse(logs.getOutput().contains(
|
assertFalse(logs.getOutput().contains(
|
||||||
"Number of suppressed write-lock reports:"));
|
"Number of suppressed write-lock reports:"));
|
||||||
assertFalse(logs.getOutput().contains(
|
|
||||||
"FSNamesystem write lock held for"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user