From feb2664ac4b246ca87fc4997a941190f00026dff Mon Sep 17 00:00:00 2001 From: Yiqun Lin Date: Sat, 22 Dec 2018 23:09:59 +0800 Subject: [PATCH] HDFS-13946. Log longest FSN write/read lock held stack trace. --- .../hadoop/log/LogThrottlingHelper.java | 16 ++++ .../hadoop/log/TestLogThrottlingHelper.java | 3 + .../server/namenode/FSNamesystemLock.java | 93 +++++++++++++++---- .../server/namenode/TestFSNamesystemLock.java | 41 ++++++-- 4 files changed, 126 insertions(+), 27 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java index 41bee0433b..848f123067 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java @@ -272,6 +272,22 @@ public LogAction record(String recorderName, long currentTimeMs, } } + /** + * Return the summary information for given index. + * + * @param recorderName The name of the recorder. + * @param idx The index value. + * @return The summary information. + */ + public SummaryStatistics getCurrentStats(String recorderName, int idx) { + LoggingAction currentLog = currentLogs.get(recorderName); + if (currentLog != null) { + return currentLog.getStats(idx); + } + + return null; + } + /** * A standard log action which keeps track of all of the values which have * been logged. This is also used for internal bookkeeping via its private diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/log/TestLogThrottlingHelper.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/log/TestLogThrottlingHelper.java index a675d0a589..d0eeea3e51 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/log/TestLogThrottlingHelper.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/log/TestLogThrottlingHelper.java @@ -167,6 +167,9 @@ public void testMultipleLoggersWithValues() { assertEquals(2.0, bar.getStats(0).getMean(), 0.01); assertEquals(3.0, baz.getStats(0).getMean(), 0.01); assertEquals(3.0, baz.getStats(1).getMean(), 0.01); + + assertEquals(2.0, helper.getCurrentStats("bar", 0).getMax(), 0); + assertEquals(3.0, helper.getCurrentStats("baz", 0).getMax(), 0); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java index 7c28465f0d..ebf51781f4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java @@ -21,10 +21,12 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.log.LogThrottlingHelper; import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation; @@ -97,8 +99,18 @@ public Long initialValue() { new AtomicInteger(0); /** Time stamp (ms) of the last time a read lock report was written. */ private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0); - /** Longest time (ms) a read lock was held since the last report. */ - private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0); + /** + * The info (lock held time and stack trace) when longest time (ms) a read + * lock was held since the last report. + */ + private final AtomicReference longestReadLockHeldInfo = + new AtomicReference<>(new ReadLockHeldInfo(0, null)); + + /** + * The stack trace when longest time of the write lock + * was held since the last report. + */ + private volatile String longestWriteLockHeldStackTrace; @VisibleForTesting static final String OP_NAME_OTHER = "OTHER"; @@ -172,12 +184,13 @@ public void readUnlock(String opName) { final long readLockIntervalMs = TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos); if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) { - long localLongestReadLock; + ReadLockHeldInfo localLockHeldInfo; do { - localLongestReadLock = longestReadLockHeldIntervalMs.get(); - } while (localLongestReadLock - readLockIntervalMs < 0 && - !longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock, - readLockIntervalMs)); + localLockHeldInfo = longestReadLockHeldInfo.get(); + } while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 && + !longestReadLockHeldInfo.compareAndSet(localLockHeldInfo, + new ReadLockHeldInfo(readLockIntervalMs, + StringUtils.getStackTrace(Thread.currentThread())))); long localTimeStampOfLastReadLockReport; long nowMs; @@ -193,13 +206,13 @@ public void readUnlock(String opName) { } while (!timeStampOfLastReadLockReportMs.compareAndSet( localTimeStampOfLastReadLockReport, nowMs)); int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0); - long longestLockHeldIntervalMs = - longestReadLockHeldIntervalMs.getAndSet(0); - FSNamesystem.LOG.info("FSNamesystem read lock held for " + - readLockIntervalMs + " ms via\n" + - StringUtils.getStackTrace(Thread.currentThread()) + - "\tNumber of suppressed read-lock reports: " + numSuppressedWarnings + - "\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs); + ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo + .getAndSet(new ReadLockHeldInfo(0, null)); + FSNamesystem.LOG.info( + "\tNumber of suppressed read-lock reports: {}" + + "\n\tLongest read-lock held interval: {}ms via {}", + numSuppressedWarnings, lockHeldInfo.getIntervalMs(), + lockHeldInfo.getStackTrace()); } } @@ -255,6 +268,14 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) { LogAction logAction = LogThrottlingHelper.DO_NOT_LOG; if (needReport && writeLockIntervalMs >= this.writeLockReportingThresholdMs) { + SummaryStatistics currentStats = + writeLockReportLogger.getCurrentStats("write", 0); + double currentMaxTime = currentStats != null ? currentStats.getMax() : 0; + if (currentMaxTime < writeLockIntervalMs) { + longestWriteLockHeldStackTrace = + StringUtils.getStackTrace(Thread.currentThread()); + } + logAction = writeLockReportLogger .record("write", currentTimeMs, writeLockIntervalMs); } @@ -266,12 +287,12 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) { } if (logAction.shouldLog()) { - FSNamesystem.LOG.info("FSNamesystem write lock held for {} ms via {}\t" + - "Number of suppressed write-lock reports: {}\n\tLongest write-lock " + - "held interval: {} \n\tTotal suppressed write-lock held time: {}", - writeLockIntervalMs, - StringUtils.getStackTrace(Thread.currentThread()), + FSNamesystem.LOG.info( + "\tNumber of suppressed write-lock reports: {}" + + "\n\tLongest write-lock held interval: {}ms via {} " + + "\n\tTotal suppressed write-lock held time: {}", logAction.getCount() - 1, logAction.getStats(0).getMax(), + longestWriteLockHeldStackTrace, logAction.getStats(0).getSum() - writeLockIntervalMs); } } @@ -323,4 +344,38 @@ private static String getMetricName(String operationName, boolean isWrite) { org.apache.commons.lang3.StringUtils.capitalize(operationName) + LOCK_METRIC_SUFFIX; } + + /** + * Read lock Held Info. + */ + private static class ReadLockHeldInfo { + /** Read lock held time. */ + private Long intervalMs; + /** The stack trace read lock was held. */ + private String stackTrace; + + ReadLockHeldInfo(long intervalMs, String stackTrace) { + this.intervalMs = intervalMs; + this.stackTrace = stackTrace; + } + + public Long getIntervalMs() { + return this.intervalMs; + } + + public String getStackTrace() { + return this.stackTrace; + } + + @Override + public int hashCode() { + return this.intervalMs.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj) + .getIntervalMs().compareTo(intervalMs) == 0; + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java index 49506fe54d..0e418d188a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java @@ -202,7 +202,11 @@ public void testFSWriteLockLongHoldingReport() throws Exception { timer.advance(writeLockReportingThreshold + 100); logs.clearOutput(); fsnLock.writeUnlock(); + // look for the method name in the stack trace assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName())); + // find the held interval time in the log + Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}"); + assertTrue(pattern.matcher(logs.getOutput()).find()); assertTrue(logs.getOutput().contains( "Number of suppressed write-lock reports: 2")); } @@ -215,7 +219,7 @@ public void testFSWriteLockLongHoldingReport() throws Exception { public void testFSReadLockLongHoldingReport() throws Exception { final long readLockReportingThreshold = 100L; final long readLockSuppressWarningInterval = 10000L; - final String readLockLogStmt = "FSNamesystem read lock held for "; + final String readLockLogStmt = "Number of suppressed read-lock reports"; Configuration conf = new Configuration(); conf.setLong( DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY, @@ -256,6 +260,18 @@ public void testFSReadLockLongHoldingReport() throws Exception { // Track but do not Report if it's held for a long time when re-entering // read lock but time since last report does not exceed the suppress // warning interval + Thread tLong = new Thread() { + @Override + public void run() { + fsnLock.readLock(); + // Add one lock hold which is the longest, but occurs under a different + // stack trace, to ensure this is the one that gets logged + timer.advance(readLockReportingThreshold + 20); + fsnLock.readUnlock(); + } + }; + tLong.start(); + tLong.join(); fsnLock.readLock(); timer.advance(readLockReportingThreshold / 2 + 1); fsnLock.readLock(); @@ -268,6 +284,18 @@ public void testFSReadLockLongHoldingReport() throws Exception { fsnLock.readUnlock(); assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) && logs.getOutput().contains(readLockLogStmt)); + timer.advance(readLockSuppressWarningInterval); + fsnLock.readLock(); + timer.advance(readLockReportingThreshold + 1); + fsnLock.readUnlock(); + // Assert that stack trace eventually logged is the one for the longest hold + String stackTracePatternString = + String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt); + Pattern tLongPattern = Pattern.compile( + String.format(stackTracePatternString, tLong.getClass().getName())); + assertTrue(tLongPattern.matcher(logs.getOutput()).find()); + assertTrue(logs.getOutput().contains( + "Number of suppressed read-lock reports: 3")); // Report if it's held for a long time (and time since last report // exceeds the suppress warning interval) while another thread also has the @@ -310,16 +338,15 @@ public void run() { t1.join(); t2.join(); // Look for the differentiating class names in the stack trace - String stackTracePatternString = - String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt); Pattern t1Pattern = Pattern.compile( String.format(stackTracePatternString, t1.getClass().getName())); assertTrue(t1Pattern.matcher(logs.getOutput()).find()); Pattern t2Pattern = Pattern.compile( String.format(stackTracePatternString, t2.getClass().getName())); assertFalse(t2Pattern.matcher(logs.getOutput()).find()); - assertTrue(logs.getOutput().contains( - "Number of suppressed read-lock reports: 2")); + // match the held interval time in the log + Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}"); + assertTrue(pattern.matcher(logs.getOutput()).find()); } @Test @@ -396,7 +423,7 @@ public void testFSWriteLockReportSuppressed() throws Exception { timer.advance(writeLockReportingThreshold + 100); fsnLock.writeUnlock(); assertTrue(logs.getOutput().contains( - "FSNamesystem write lock held for")); + "Number of suppressed write-lock reports")); logs.clearOutput(); @@ -407,8 +434,6 @@ public void testFSWriteLockReportSuppressed() throws Exception { assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName())); assertFalse(logs.getOutput().contains( "Number of suppressed write-lock reports:")); - assertFalse(logs.getOutput().contains( - "FSNamesystem write lock held for")); } } \ No newline at end of file