HDFS-13946. Log longest FSN write/read lock held stack trace.

This commit is contained in:
Yiqun Lin 2018-12-22 23:09:59 +08:00
parent 763e96ea2b
commit feb2664ac4
4 changed files with 126 additions and 27 deletions

View File

@ -272,6 +272,22 @@ public class LogThrottlingHelper {
}
}
/**
* Return the summary information for given index.
*
* @param recorderName The name of the recorder.
* @param idx The index value.
* @return The summary information.
*/
public SummaryStatistics getCurrentStats(String recorderName, int idx) {
LoggingAction currentLog = currentLogs.get(recorderName);
if (currentLog != null) {
return currentLog.getStats(idx);
}
return null;
}
/**
* A standard log action which keeps track of all of the values which have
* been logged. This is also used for internal bookkeeping via its private

View File

@ -167,6 +167,9 @@ public class TestLogThrottlingHelper {
assertEquals(2.0, bar.getStats(0).getMean(), 0.01);
assertEquals(3.0, baz.getStats(0).getMean(), 0.01);
assertEquals(3.0, baz.getStats(1).getMean(), 0.01);
assertEquals(2.0, helper.getCurrentStats("bar", 0).getMax(), 0);
assertEquals(3.0, helper.getCurrentStats("baz", 0).getMax(), 0);
}
}

View File

@ -21,10 +21,12 @@ package org.apache.hadoop.hdfs.server.namenode;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.log.LogThrottlingHelper;
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
@ -97,8 +99,18 @@ class FSNamesystemLock {
new AtomicInteger(0);
/** Time stamp (ms) of the last time a read lock report was written. */
private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0);
/** Longest time (ms) a read lock was held since the last report. */
private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0);
/**
* The info (lock held time and stack trace) when longest time (ms) a read
* lock was held since the last report.
*/
private final AtomicReference<ReadLockHeldInfo> longestReadLockHeldInfo =
new AtomicReference<>(new ReadLockHeldInfo(0, null));
/**
* The stack trace when longest time of the write lock
* was held since the last report.
*/
private volatile String longestWriteLockHeldStackTrace;
@VisibleForTesting
static final String OP_NAME_OTHER = "OTHER";
@ -172,12 +184,13 @@ class FSNamesystemLock {
final long readLockIntervalMs =
TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
long localLongestReadLock;
ReadLockHeldInfo localLockHeldInfo;
do {
localLongestReadLock = longestReadLockHeldIntervalMs.get();
} while (localLongestReadLock - readLockIntervalMs < 0 &&
!longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock,
readLockIntervalMs));
localLockHeldInfo = longestReadLockHeldInfo.get();
} while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 &&
!longestReadLockHeldInfo.compareAndSet(localLockHeldInfo,
new ReadLockHeldInfo(readLockIntervalMs,
StringUtils.getStackTrace(Thread.currentThread()))));
long localTimeStampOfLastReadLockReport;
long nowMs;
@ -193,13 +206,13 @@ class FSNamesystemLock {
} while (!timeStampOfLastReadLockReportMs.compareAndSet(
localTimeStampOfLastReadLockReport, nowMs));
int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
long longestLockHeldIntervalMs =
longestReadLockHeldIntervalMs.getAndSet(0);
FSNamesystem.LOG.info("FSNamesystem read lock held for " +
readLockIntervalMs + " ms via\n" +
StringUtils.getStackTrace(Thread.currentThread()) +
"\tNumber of suppressed read-lock reports: " + numSuppressedWarnings +
"\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs);
ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo
.getAndSet(new ReadLockHeldInfo(0, null));
FSNamesystem.LOG.info(
"\tNumber of suppressed read-lock reports: {}" +
"\n\tLongest read-lock held interval: {}ms via {}",
numSuppressedWarnings, lockHeldInfo.getIntervalMs(),
lockHeldInfo.getStackTrace());
}
}
@ -255,6 +268,14 @@ class FSNamesystemLock {
LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
if (needReport &&
writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
SummaryStatistics currentStats =
writeLockReportLogger.getCurrentStats("write", 0);
double currentMaxTime = currentStats != null ? currentStats.getMax() : 0;
if (currentMaxTime < writeLockIntervalMs) {
longestWriteLockHeldStackTrace =
StringUtils.getStackTrace(Thread.currentThread());
}
logAction = writeLockReportLogger
.record("write", currentTimeMs, writeLockIntervalMs);
}
@ -266,12 +287,12 @@ class FSNamesystemLock {
}
if (logAction.shouldLog()) {
FSNamesystem.LOG.info("FSNamesystem write lock held for {} ms via {}\t" +
"Number of suppressed write-lock reports: {}\n\tLongest write-lock " +
"held interval: {} \n\tTotal suppressed write-lock held time: {}",
writeLockIntervalMs,
StringUtils.getStackTrace(Thread.currentThread()),
FSNamesystem.LOG.info(
"\tNumber of suppressed write-lock reports: {}" +
"\n\tLongest write-lock held interval: {}ms via {} " +
"\n\tTotal suppressed write-lock held time: {}",
logAction.getCount() - 1, logAction.getStats(0).getMax(),
longestWriteLockHeldStackTrace,
logAction.getStats(0).getSum() - writeLockIntervalMs);
}
}
@ -323,4 +344,38 @@ class FSNamesystemLock {
org.apache.commons.lang3.StringUtils.capitalize(operationName) +
LOCK_METRIC_SUFFIX;
}
/**
* Read lock Held Info.
*/
private static class ReadLockHeldInfo {
/** Read lock held time. */
private Long intervalMs;
/** The stack trace read lock was held. */
private String stackTrace;
ReadLockHeldInfo(long intervalMs, String stackTrace) {
this.intervalMs = intervalMs;
this.stackTrace = stackTrace;
}
public Long getIntervalMs() {
return this.intervalMs;
}
public String getStackTrace() {
return this.stackTrace;
}
@Override
public int hashCode() {
return this.intervalMs.hashCode();
}
@Override
public boolean equals(Object obj) {
return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj)
.getIntervalMs().compareTo(intervalMs) == 0;
}
}
}

View File

@ -202,7 +202,11 @@ public class TestFSNamesystemLock {
timer.advance(writeLockReportingThreshold + 100);
logs.clearOutput();
fsnLock.writeUnlock();
// look for the method name in the stack trace
assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
// find the held interval time in the log
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
assertTrue(pattern.matcher(logs.getOutput()).find());
assertTrue(logs.getOutput().contains(
"Number of suppressed write-lock reports: 2"));
}
@ -215,7 +219,7 @@ public class TestFSNamesystemLock {
public void testFSReadLockLongHoldingReport() throws Exception {
final long readLockReportingThreshold = 100L;
final long readLockSuppressWarningInterval = 10000L;
final String readLockLogStmt = "FSNamesystem read lock held for ";
final String readLockLogStmt = "Number of suppressed read-lock reports";
Configuration conf = new Configuration();
conf.setLong(
DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
@ -256,6 +260,18 @@ public class TestFSNamesystemLock {
// Track but do not Report if it's held for a long time when re-entering
// read lock but time since last report does not exceed the suppress
// warning interval
Thread tLong = new Thread() {
@Override
public void run() {
fsnLock.readLock();
// Add one lock hold which is the longest, but occurs under a different
// stack trace, to ensure this is the one that gets logged
timer.advance(readLockReportingThreshold + 20);
fsnLock.readUnlock();
}
};
tLong.start();
tLong.join();
fsnLock.readLock();
timer.advance(readLockReportingThreshold / 2 + 1);
fsnLock.readLock();
@ -268,6 +284,18 @@ public class TestFSNamesystemLock {
fsnLock.readUnlock();
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
logs.getOutput().contains(readLockLogStmt));
timer.advance(readLockSuppressWarningInterval);
fsnLock.readLock();
timer.advance(readLockReportingThreshold + 1);
fsnLock.readUnlock();
// Assert that stack trace eventually logged is the one for the longest hold
String stackTracePatternString =
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
Pattern tLongPattern = Pattern.compile(
String.format(stackTracePatternString, tLong.getClass().getName()));
assertTrue(tLongPattern.matcher(logs.getOutput()).find());
assertTrue(logs.getOutput().contains(
"Number of suppressed read-lock reports: 3"));
// Report if it's held for a long time (and time since last report
// exceeds the suppress warning interval) while another thread also has the
@ -310,16 +338,15 @@ public class TestFSNamesystemLock {
t1.join();
t2.join();
// Look for the differentiating class names in the stack trace
String stackTracePatternString =
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
Pattern t1Pattern = Pattern.compile(
String.format(stackTracePatternString, t1.getClass().getName()));
assertTrue(t1Pattern.matcher(logs.getOutput()).find());
Pattern t2Pattern = Pattern.compile(
String.format(stackTracePatternString, t2.getClass().getName()));
assertFalse(t2Pattern.matcher(logs.getOutput()).find());
assertTrue(logs.getOutput().contains(
"Number of suppressed read-lock reports: 2"));
// match the held interval time in the log
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
assertTrue(pattern.matcher(logs.getOutput()).find());
}
@Test
@ -396,7 +423,7 @@ public class TestFSNamesystemLock {
timer.advance(writeLockReportingThreshold + 100);
fsnLock.writeUnlock();
assertTrue(logs.getOutput().contains(
"FSNamesystem write lock held for"));
"Number of suppressed write-lock reports"));
logs.clearOutput();
@ -407,8 +434,6 @@ public class TestFSNamesystemLock {
assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()));
assertFalse(logs.getOutput().contains(
"Number of suppressed write-lock reports:"));
assertFalse(logs.getOutput().contains(
"FSNamesystem write lock held for"));
}
}