diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java index d90e1d0ab5..be33c4b532 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java @@ -1,4 +1,3 @@ - /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -46,7 +45,11 @@ * {@link org.apache.hadoop.hdfs.DFSConfigKeys#DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY} * to be true, metrics will be emitted into the FSNamesystem metrics registry * for each operation which acquires this lock indicating how long the operation - * held the lock for. Note that if a thread dies, metrics produced after the + * held the lock for. These metrics have names of the form + * FSN(Read|Write)LockNanosOperationName, where OperationName denotes the name + * of the operation that initiated the lock hold (this will be OTHER for certain + * uncategorized operations) and they export the hold time values in + * nanoseconds. Note that if a thread dies, metrics produced after the * most recent snapshot will be lost due to the use of * {@link MutableRatesWithAggregation}. However since threads are re-used * between operations this should not generally be an issue. @@ -63,24 +66,26 @@ class FSNamesystemLock { * Log statements about long lock hold times will not be produced more * frequently than this interval. */ - private final long lockSuppressWarningInterval; + private final long lockSuppressWarningIntervalMs; /** Threshold (ms) for long holding write lock report. */ - private final long writeLockReportingThreshold; + private final long writeLockReportingThresholdMs; /** Last time stamp for write lock. Keep the longest one for multi-entrance.*/ - private long writeLockHeldTimeStamp; + private long writeLockHeldTimeStampNanos; private int numWriteLockWarningsSuppressed = 0; - private long timeStampOfLastWriteLockReport = 0; - private long longestWriteLockHeldInterval = 0; + /** Time stamp (ms) of the last time a write lock report was written. */ + private long timeStampOfLastWriteLockReportMs = 0; + /** Longest time (ms) a write lock was held since the last report. */ + private long longestWriteLockHeldIntervalMs = 0; /** Threshold (ms) for long holding read lock report. */ - private final long readLockReportingThreshold; + private final long readLockReportingThresholdMs; /** * Last time stamp for read lock. Keep the longest one for * multi-entrance. This is ThreadLocal since there could be * many read locks held simultaneously. */ - private final ThreadLocal readLockHeldTimeStamp = + private final ThreadLocal readLockHeldTimeStampNanos = new ThreadLocal() { @Override public Long initialValue() { @@ -89,13 +94,16 @@ public Long initialValue() { }; private final AtomicInteger numReadLockWarningsSuppressed = new AtomicInteger(0); - private final AtomicLong timeStampOfLastReadLockReport = new AtomicLong(0); - private final AtomicLong longestReadLockHeldInterval = new AtomicLong(0); + /** Time stamp (ms) of the last time a read lock report was written. */ + private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0); + /** Longest time (ms) a read lock was held since the last report. */ + private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0); @VisibleForTesting static final String OP_NAME_OTHER = "OTHER"; private static final String READ_LOCK_METRIC_PREFIX = "FSNReadLock"; private static final String WRITE_LOCK_METRIC_PREFIX = "FSNWriteLock"; + private static final String LOCK_METRIC_SUFFIX = "Nanos"; FSNamesystemLock(Configuration conf, MutableRatesWithAggregation detailedHoldTimeMetrics) { @@ -110,13 +118,13 @@ public Long initialValue() { this.coarseLock = new ReentrantReadWriteLock(fair); this.timer = timer; - this.writeLockReportingThreshold = conf.getLong( + this.writeLockReportingThresholdMs = conf.getLong( DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY, DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT); - this.readLockReportingThreshold = conf.getLong( + this.readLockReportingThresholdMs = conf.getLong( DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY, DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT); - this.lockSuppressWarningInterval = conf.getTimeDuration( + this.lockSuppressWarningIntervalMs = conf.getTimeDuration( DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY, DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS); this.metricsEnabled = conf.getBoolean( @@ -130,7 +138,7 @@ public Long initialValue() { public void readLock() { coarseLock.readLock().lock(); if (coarseLock.getReadHoldCount() == 1) { - readLockHeldTimeStamp.set(timer.monotonicNow()); + readLockHeldTimeStampNanos.set(timer.monotonicNowNanos()); } } @@ -140,56 +148,59 @@ public void readUnlock() { public void readUnlock(String opName) { final boolean needReport = coarseLock.getReadHoldCount() == 1; - final long readLockInterval = - timer.monotonicNow() - readLockHeldTimeStamp.get(); + final long readLockIntervalNanos = + timer.monotonicNowNanos() - readLockHeldTimeStampNanos.get(); coarseLock.readLock().unlock(); if (needReport) { - addMetric(opName, readLockInterval, false); - readLockHeldTimeStamp.remove(); + addMetric(opName, readLockIntervalNanos, false); + readLockHeldTimeStampNanos.remove(); } - if (needReport && readLockInterval >= this.readLockReportingThreshold) { + final long readLockIntervalMs = + TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos); + if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) { long localLongestReadLock; do { - localLongestReadLock = longestReadLockHeldInterval.get(); - } while (localLongestReadLock - readLockInterval < 0 && - !longestReadLockHeldInterval.compareAndSet(localLongestReadLock, - readLockInterval)); + localLongestReadLock = longestReadLockHeldIntervalMs.get(); + } while (localLongestReadLock - readLockIntervalMs < 0 && + !longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock, + readLockIntervalMs)); long localTimeStampOfLastReadLockReport; - long now; + long nowMs; do { - now = timer.monotonicNow(); + nowMs = timer.monotonicNow(); localTimeStampOfLastReadLockReport = - timeStampOfLastReadLockReport.get(); - if (now - localTimeStampOfLastReadLockReport < - lockSuppressWarningInterval) { + timeStampOfLastReadLockReportMs.get(); + if (nowMs - localTimeStampOfLastReadLockReport < + lockSuppressWarningIntervalMs) { numReadLockWarningsSuppressed.incrementAndGet(); return; } - } while (!timeStampOfLastReadLockReport.compareAndSet( - localTimeStampOfLastReadLockReport, now)); + } while (!timeStampOfLastReadLockReportMs.compareAndSet( + localTimeStampOfLastReadLockReport, nowMs)); int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0); - long longestLockHeldInterval = longestReadLockHeldInterval.getAndSet(0); + long longestLockHeldIntervalMs = + longestReadLockHeldIntervalMs.getAndSet(0); FSNamesystem.LOG.info("FSNamesystem read lock held for " + - readLockInterval + " ms via\n" + + readLockIntervalMs + " ms via\n" + StringUtils.getStackTrace(Thread.currentThread()) + "\tNumber of suppressed read-lock reports: " + numSuppressedWarnings + - "\n\tLongest read-lock held interval: " + longestLockHeldInterval); + "\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs); } } public void writeLock() { coarseLock.writeLock().lock(); if (coarseLock.getWriteHoldCount() == 1) { - writeLockHeldTimeStamp = timer.monotonicNow(); + writeLockHeldTimeStampNanos = timer.monotonicNowNanos(); } } public void writeLockInterruptibly() throws InterruptedException { coarseLock.writeLock().lockInterruptibly(); if (coarseLock.getWriteHoldCount() == 1) { - writeLockHeldTimeStamp = timer.monotonicNow(); + writeLockHeldTimeStampNanos = timer.monotonicNowNanos(); } } @@ -200,24 +211,29 @@ public void writeUnlock() { public void writeUnlock(String opName) { final boolean needReport = coarseLock.getWriteHoldCount() == 1 && coarseLock.isWriteLockedByCurrentThread(); - final long currentTime = timer.monotonicNow(); - final long writeLockInterval = currentTime - writeLockHeldTimeStamp; + final long currentTimeNanos = timer.monotonicNowNanos(); + final long writeLockIntervalNanos = + currentTimeNanos - writeLockHeldTimeStampNanos; + final long currentTimeMs = TimeUnit.NANOSECONDS.toMillis(currentTimeNanos); + final long writeLockIntervalMs = + TimeUnit.NANOSECONDS.toMillis(writeLockIntervalNanos); boolean logReport = false; int numSuppressedWarnings = 0; - long longestLockHeldInterval = 0; - if (needReport && writeLockInterval >= this.writeLockReportingThreshold) { - if (writeLockInterval > longestWriteLockHeldInterval) { - longestWriteLockHeldInterval = writeLockInterval; + long longestLockHeldIntervalMs = 0; + if (needReport && + writeLockIntervalMs >= this.writeLockReportingThresholdMs) { + if (writeLockIntervalMs > longestWriteLockHeldIntervalMs) { + longestWriteLockHeldIntervalMs = writeLockIntervalMs; } - if (currentTime - timeStampOfLastWriteLockReport > - this.lockSuppressWarningInterval) { + if (currentTimeMs - timeStampOfLastWriteLockReportMs > + this.lockSuppressWarningIntervalMs) { logReport = true; numSuppressedWarnings = numWriteLockWarningsSuppressed; numWriteLockWarningsSuppressed = 0; - longestLockHeldInterval = longestWriteLockHeldInterval; - longestWriteLockHeldInterval = 0; - timeStampOfLastWriteLockReport = currentTime; + longestLockHeldIntervalMs = longestWriteLockHeldIntervalMs; + longestWriteLockHeldIntervalMs = 0; + timeStampOfLastWriteLockReportMs = currentTimeMs; } else { numWriteLockWarningsSuppressed++; } @@ -226,16 +242,16 @@ public void writeUnlock(String opName) { coarseLock.writeLock().unlock(); if (needReport) { - addMetric(opName, writeLockInterval, true); + addMetric(opName, writeLockIntervalNanos, true); } if (logReport) { FSNamesystem.LOG.info("FSNamesystem write lock held for " + - writeLockInterval + " ms via\n" + + writeLockIntervalMs + " ms via\n" + StringUtils.getStackTrace(Thread.currentThread()) + "\tNumber of suppressed write-lock reports: " + numSuppressedWarnings + "\n\tLongest write-lock held interval: " + - longestLockHeldInterval); + longestLockHeldIntervalMs); } } @@ -269,13 +285,14 @@ public int getQueueLength() { /** * Add the lock hold time for a recent operation to the metrics. * @param operationName Name of the operation for which to record the time - * @param value Length of time the lock was held + * @param value Length of time the lock was held (nanoseconds) */ private void addMetric(String operationName, long value, boolean isWrite) { if (metricsEnabled) { String metricName = (isWrite ? WRITE_LOCK_METRIC_PREFIX : READ_LOCK_METRIC_PREFIX) + - org.apache.commons.lang.StringUtils.capitalize(operationName); + org.apache.commons.lang.StringUtils.capitalize(operationName) + + LOCK_METRIC_SUFFIX; detailedHoldTimeMetrics.add(metricName, value); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index 2b1495196f..5365cacd7c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -2740,7 +2740,11 @@ dfs.namenode.lock.detailed-metrics.enabled false If true, the namenode will keep track of how long various - operations hold the Namesystem lock for and emit this as metrics. + operations hold the Namesystem lock for and emit this as metrics. These + metrics have names of the form FSN(Read|Write)LockNanosOperationName, + where OperationName denotes the name of the operation that initiated the + lock hold (this will be OTHER for certain uncategorized operations) and + they export the hold time values in nanoseconds. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java index c97e0f0ac2..1af8bf79e8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java @@ -331,10 +331,10 @@ public void testDetailedHoldMetrics() throws Exception { FSNamesystemLock fsLock = new FSNamesystemLock(conf, rates, timer); fsLock.readLock(); - timer.advance(1); + timer.advanceNanos(1200000); fsLock.readUnlock("foo"); fsLock.readLock(); - timer.advance(2); + timer.advanceNanos(2400000); fsLock.readUnlock("foo"); fsLock.readLock(); @@ -351,12 +351,12 @@ public void testDetailedHoldMetrics() throws Exception { MetricsRecordBuilder rb = MetricsAsserts.mockMetricsRecordBuilder(); rates.snapshot(rb, true); - assertGauge("FSNReadLockFooAvgTime", 1.5, rb); - assertCounter("FSNReadLockFooNumOps", 2L, rb); - assertGauge("FSNReadLockBarAvgTime", 2.0, rb); - assertCounter("FSNReadLockBarNumOps", 1L, rb); - assertGauge("FSNWriteLockBazAvgTime", 1.0, rb); - assertCounter("FSNWriteLockBazNumOps", 1L, rb); + assertGauge("FSNReadLockFooNanosAvgTime", 1800000.0, rb); + assertCounter("FSNReadLockFooNanosNumOps", 2L, rb); + assertGauge("FSNReadLockBarNanosAvgTime", 2000000.0, rb); + assertCounter("FSNReadLockBarNanosNumOps", 1L, rb); + assertGauge("FSNWriteLockBazNanosAvgTime", 1000000.0, rb); + assertCounter("FSNWriteLockBazNanosNumOps", 1L, rb); } } \ No newline at end of file