HDFS-13946. Log longest FSN write/read lock held stack trace.
This commit is contained in:
parent
763e96ea2b
commit
feb2664ac4
@ -272,6 +272,22 @@ public LogAction record(String recorderName, long currentTimeMs,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the summary information for given index.
|
||||
*
|
||||
* @param recorderName The name of the recorder.
|
||||
* @param idx The index value.
|
||||
* @return The summary information.
|
||||
*/
|
||||
public SummaryStatistics getCurrentStats(String recorderName, int idx) {
|
||||
LoggingAction currentLog = currentLogs.get(recorderName);
|
||||
if (currentLog != null) {
|
||||
return currentLog.getStats(idx);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* A standard log action which keeps track of all of the values which have
|
||||
* been logged. This is also used for internal bookkeeping via its private
|
||||
|
@ -167,6 +167,9 @@ public void testMultipleLoggersWithValues() {
|
||||
assertEquals(2.0, bar.getStats(0).getMean(), 0.01);
|
||||
assertEquals(3.0, baz.getStats(0).getMean(), 0.01);
|
||||
assertEquals(3.0, baz.getStats(1).getMean(), 0.01);
|
||||
|
||||
assertEquals(2.0, helper.getCurrentStats("bar", 0).getMax(), 0);
|
||||
assertEquals(3.0, helper.getCurrentStats("baz", 0).getMax(), 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -21,10 +21,12 @@
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.concurrent.locks.Condition;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.log.LogThrottlingHelper;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
|
||||
@ -97,8 +99,18 @@ public Long initialValue() {
|
||||
new AtomicInteger(0);
|
||||
/** Time stamp (ms) of the last time a read lock report was written. */
|
||||
private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0);
|
||||
/** Longest time (ms) a read lock was held since the last report. */
|
||||
private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0);
|
||||
/**
|
||||
* The info (lock held time and stack trace) when longest time (ms) a read
|
||||
* lock was held since the last report.
|
||||
*/
|
||||
private final AtomicReference<ReadLockHeldInfo> longestReadLockHeldInfo =
|
||||
new AtomicReference<>(new ReadLockHeldInfo(0, null));
|
||||
|
||||
/**
|
||||
* The stack trace when longest time of the write lock
|
||||
* was held since the last report.
|
||||
*/
|
||||
private volatile String longestWriteLockHeldStackTrace;
|
||||
|
||||
@VisibleForTesting
|
||||
static final String OP_NAME_OTHER = "OTHER";
|
||||
@ -172,12 +184,13 @@ public void readUnlock(String opName) {
|
||||
final long readLockIntervalMs =
|
||||
TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
|
||||
if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
|
||||
long localLongestReadLock;
|
||||
ReadLockHeldInfo localLockHeldInfo;
|
||||
do {
|
||||
localLongestReadLock = longestReadLockHeldIntervalMs.get();
|
||||
} while (localLongestReadLock - readLockIntervalMs < 0 &&
|
||||
!longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock,
|
||||
readLockIntervalMs));
|
||||
localLockHeldInfo = longestReadLockHeldInfo.get();
|
||||
} while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 &&
|
||||
!longestReadLockHeldInfo.compareAndSet(localLockHeldInfo,
|
||||
new ReadLockHeldInfo(readLockIntervalMs,
|
||||
StringUtils.getStackTrace(Thread.currentThread()))));
|
||||
|
||||
long localTimeStampOfLastReadLockReport;
|
||||
long nowMs;
|
||||
@ -193,13 +206,13 @@ public void readUnlock(String opName) {
|
||||
} while (!timeStampOfLastReadLockReportMs.compareAndSet(
|
||||
localTimeStampOfLastReadLockReport, nowMs));
|
||||
int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
|
||||
long longestLockHeldIntervalMs =
|
||||
longestReadLockHeldIntervalMs.getAndSet(0);
|
||||
FSNamesystem.LOG.info("FSNamesystem read lock held for " +
|
||||
readLockIntervalMs + " ms via\n" +
|
||||
StringUtils.getStackTrace(Thread.currentThread()) +
|
||||
"\tNumber of suppressed read-lock reports: " + numSuppressedWarnings +
|
||||
"\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs);
|
||||
ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo
|
||||
.getAndSet(new ReadLockHeldInfo(0, null));
|
||||
FSNamesystem.LOG.info(
|
||||
"\tNumber of suppressed read-lock reports: {}" +
|
||||
"\n\tLongest read-lock held interval: {}ms via {}",
|
||||
numSuppressedWarnings, lockHeldInfo.getIntervalMs(),
|
||||
lockHeldInfo.getStackTrace());
|
||||
}
|
||||
}
|
||||
|
||||
@ -255,6 +268,14 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
|
||||
LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
|
||||
if (needReport &&
|
||||
writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
|
||||
SummaryStatistics currentStats =
|
||||
writeLockReportLogger.getCurrentStats("write", 0);
|
||||
double currentMaxTime = currentStats != null ? currentStats.getMax() : 0;
|
||||
if (currentMaxTime < writeLockIntervalMs) {
|
||||
longestWriteLockHeldStackTrace =
|
||||
StringUtils.getStackTrace(Thread.currentThread());
|
||||
}
|
||||
|
||||
logAction = writeLockReportLogger
|
||||
.record("write", currentTimeMs, writeLockIntervalMs);
|
||||
}
|
||||
@ -266,12 +287,12 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
|
||||
}
|
||||
|
||||
if (logAction.shouldLog()) {
|
||||
FSNamesystem.LOG.info("FSNamesystem write lock held for {} ms via {}\t" +
|
||||
"Number of suppressed write-lock reports: {}\n\tLongest write-lock " +
|
||||
"held interval: {} \n\tTotal suppressed write-lock held time: {}",
|
||||
writeLockIntervalMs,
|
||||
StringUtils.getStackTrace(Thread.currentThread()),
|
||||
FSNamesystem.LOG.info(
|
||||
"\tNumber of suppressed write-lock reports: {}" +
|
||||
"\n\tLongest write-lock held interval: {}ms via {} " +
|
||||
"\n\tTotal suppressed write-lock held time: {}",
|
||||
logAction.getCount() - 1, logAction.getStats(0).getMax(),
|
||||
longestWriteLockHeldStackTrace,
|
||||
logAction.getStats(0).getSum() - writeLockIntervalMs);
|
||||
}
|
||||
}
|
||||
@ -323,4 +344,38 @@ private static String getMetricName(String operationName, boolean isWrite) {
|
||||
org.apache.commons.lang3.StringUtils.capitalize(operationName) +
|
||||
LOCK_METRIC_SUFFIX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read lock Held Info.
|
||||
*/
|
||||
private static class ReadLockHeldInfo {
|
||||
/** Read lock held time. */
|
||||
private Long intervalMs;
|
||||
/** The stack trace read lock was held. */
|
||||
private String stackTrace;
|
||||
|
||||
ReadLockHeldInfo(long intervalMs, String stackTrace) {
|
||||
this.intervalMs = intervalMs;
|
||||
this.stackTrace = stackTrace;
|
||||
}
|
||||
|
||||
public Long getIntervalMs() {
|
||||
return this.intervalMs;
|
||||
}
|
||||
|
||||
public String getStackTrace() {
|
||||
return this.stackTrace;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return this.intervalMs.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj)
|
||||
.getIntervalMs().compareTo(intervalMs) == 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -202,7 +202,11 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
|
||||
timer.advance(writeLockReportingThreshold + 100);
|
||||
logs.clearOutput();
|
||||
fsnLock.writeUnlock();
|
||||
// look for the method name in the stack trace
|
||||
assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
||||
// find the held interval time in the log
|
||||
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
|
||||
assertTrue(pattern.matcher(logs.getOutput()).find());
|
||||
assertTrue(logs.getOutput().contains(
|
||||
"Number of suppressed write-lock reports: 2"));
|
||||
}
|
||||
@ -215,7 +219,7 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
|
||||
public void testFSReadLockLongHoldingReport() throws Exception {
|
||||
final long readLockReportingThreshold = 100L;
|
||||
final long readLockSuppressWarningInterval = 10000L;
|
||||
final String readLockLogStmt = "FSNamesystem read lock held for ";
|
||||
final String readLockLogStmt = "Number of suppressed read-lock reports";
|
||||
Configuration conf = new Configuration();
|
||||
conf.setLong(
|
||||
DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
|
||||
@ -256,6 +260,18 @@ public void testFSReadLockLongHoldingReport() throws Exception {
|
||||
// Track but do not Report if it's held for a long time when re-entering
|
||||
// read lock but time since last report does not exceed the suppress
|
||||
// warning interval
|
||||
Thread tLong = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
fsnLock.readLock();
|
||||
// Add one lock hold which is the longest, but occurs under a different
|
||||
// stack trace, to ensure this is the one that gets logged
|
||||
timer.advance(readLockReportingThreshold + 20);
|
||||
fsnLock.readUnlock();
|
||||
}
|
||||
};
|
||||
tLong.start();
|
||||
tLong.join();
|
||||
fsnLock.readLock();
|
||||
timer.advance(readLockReportingThreshold / 2 + 1);
|
||||
fsnLock.readLock();
|
||||
@ -268,6 +284,18 @@ public void testFSReadLockLongHoldingReport() throws Exception {
|
||||
fsnLock.readUnlock();
|
||||
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
|
||||
logs.getOutput().contains(readLockLogStmt));
|
||||
timer.advance(readLockSuppressWarningInterval);
|
||||
fsnLock.readLock();
|
||||
timer.advance(readLockReportingThreshold + 1);
|
||||
fsnLock.readUnlock();
|
||||
// Assert that stack trace eventually logged is the one for the longest hold
|
||||
String stackTracePatternString =
|
||||
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
|
||||
Pattern tLongPattern = Pattern.compile(
|
||||
String.format(stackTracePatternString, tLong.getClass().getName()));
|
||||
assertTrue(tLongPattern.matcher(logs.getOutput()).find());
|
||||
assertTrue(logs.getOutput().contains(
|
||||
"Number of suppressed read-lock reports: 3"));
|
||||
|
||||
// Report if it's held for a long time (and time since last report
|
||||
// exceeds the suppress warning interval) while another thread also has the
|
||||
@ -310,16 +338,15 @@ public void run() {
|
||||
t1.join();
|
||||
t2.join();
|
||||
// Look for the differentiating class names in the stack trace
|
||||
String stackTracePatternString =
|
||||
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
|
||||
Pattern t1Pattern = Pattern.compile(
|
||||
String.format(stackTracePatternString, t1.getClass().getName()));
|
||||
assertTrue(t1Pattern.matcher(logs.getOutput()).find());
|
||||
Pattern t2Pattern = Pattern.compile(
|
||||
String.format(stackTracePatternString, t2.getClass().getName()));
|
||||
assertFalse(t2Pattern.matcher(logs.getOutput()).find());
|
||||
assertTrue(logs.getOutput().contains(
|
||||
"Number of suppressed read-lock reports: 2"));
|
||||
// match the held interval time in the log
|
||||
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
|
||||
assertTrue(pattern.matcher(logs.getOutput()).find());
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -396,7 +423,7 @@ public void testFSWriteLockReportSuppressed() throws Exception {
|
||||
timer.advance(writeLockReportingThreshold + 100);
|
||||
fsnLock.writeUnlock();
|
||||
assertTrue(logs.getOutput().contains(
|
||||
"FSNamesystem write lock held for"));
|
||||
"Number of suppressed write-lock reports"));
|
||||
|
||||
logs.clearOutput();
|
||||
|
||||
@ -407,8 +434,6 @@ public void testFSWriteLockReportSuppressed() throws Exception {
|
||||
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()));
|
||||
assertFalse(logs.getOutput().contains(
|
||||
"Number of suppressed write-lock reports:"));
|
||||
assertFalse(logs.getOutput().contains(
|
||||
"FSNamesystem write lock held for"));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user