HDFS-10872. Add MutableRate metrics for FSNamesystemLock operations. Contributed by Erik Krogen.
This commit is contained in:
parent
6efb8c9c60
commit
ff0b99eafe
@ -281,6 +281,14 @@ public synchronized MutableRate newRate(String name, String desc,
|
||||
return ret;
|
||||
}
|
||||
|
||||
public synchronized MutableRatesWithAggregation newRatesWithAggregation(
|
||||
String name) {
|
||||
checkMetricName(name);
|
||||
MutableRatesWithAggregation rates = new MutableRatesWithAggregation();
|
||||
metricsMap.put(name, rates);
|
||||
return rates;
|
||||
}
|
||||
|
||||
synchronized void add(String name, MutableMetric metric) {
|
||||
checkMetricName(name);
|
||||
metricsMap.put(name, metric);
|
||||
|
@ -68,9 +68,7 @@ MutableMetric newForField(Field field, Metric annotation,
|
||||
return new MutableRates(registry);
|
||||
}
|
||||
if (cls == MutableRatesWithAggregation.class) {
|
||||
MutableRatesWithAggregation rates = new MutableRatesWithAggregation();
|
||||
registry.add(info.name(), rates);
|
||||
return rates;
|
||||
return registry.newRatesWithAggregation(info.name());
|
||||
}
|
||||
if (cls == MutableStat.class) {
|
||||
return registry.newStat(info.name(), info.description(),
|
||||
|
@ -415,6 +415,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
||||
public static final long
|
||||
DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT = 25;
|
||||
|
||||
public static final String DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY =
|
||||
"dfs.namenode.lock.detailed-metrics.enabled";
|
||||
public static final boolean DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT =
|
||||
false;
|
||||
// Threshold for how long namenode locks must be held for the
|
||||
// event to be logged
|
||||
public static final String DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY =
|
||||
|
@ -959,7 +959,7 @@ public final void processCacheReport(final DatanodeID datanodeID,
|
||||
processCacheReportImpl(datanode, blockIds);
|
||||
} finally {
|
||||
endTime = Time.monotonicNow();
|
||||
namesystem.writeUnlock();
|
||||
namesystem.writeUnlock("processCacheReport");
|
||||
}
|
||||
|
||||
// Log the block report processing stats from Namenode perspective
|
||||
|
@ -264,7 +264,7 @@ void doCheckpoint() throws IOException {
|
||||
bnStorage.writeAll();
|
||||
}
|
||||
} finally {
|
||||
backupNode.namesystem.writeUnlock();
|
||||
backupNode.namesystem.writeUnlock("doCheckpoint");
|
||||
}
|
||||
|
||||
if(cpCmd.needToReturnImage()) {
|
||||
|
@ -115,7 +115,7 @@ public boolean yield() {
|
||||
|
||||
// unlock
|
||||
dir.readUnlock();
|
||||
fsn.readUnlock();
|
||||
fsn.readUnlock("contentSummary");
|
||||
|
||||
try {
|
||||
Thread.sleep(sleepMilliSec, sleepNanoSec);
|
||||
|
@ -155,7 +155,7 @@ long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId,
|
||||
return numEdits;
|
||||
} finally {
|
||||
edits.close();
|
||||
fsNamesys.writeUnlock();
|
||||
fsNamesys.writeUnlock("loadFSEdits");
|
||||
prog.endStep(Phase.LOADING_EDITS, step);
|
||||
}
|
||||
}
|
||||
@ -292,7 +292,7 @@ long loadEditRecords(EditLogInputStream in, boolean closeOnExit,
|
||||
in.close();
|
||||
}
|
||||
fsDir.writeUnlock();
|
||||
fsNamesys.writeUnlock();
|
||||
fsNamesys.writeUnlock("loadEditRecords");
|
||||
|
||||
if (LOG.isTraceEnabled()) {
|
||||
LOG.trace("replaying edit log finished");
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -27,11 +27,14 @@
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.util.Timer;
|
||||
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
|
||||
@ -40,11 +43,20 @@
|
||||
/**
|
||||
* Mimics a ReentrantReadWriteLock but does not directly implement the interface
|
||||
* so more sophisticated locking capabilities and logging/metrics are possible.
|
||||
* {@link org.apache.hadoop.hdfs.DFSConfigKeys#DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY}
|
||||
* to be true, metrics will be emitted into the FSNamesystem metrics registry
|
||||
* for each operation which acquires this lock indicating how long the operation
|
||||
* held the lock for. Note that if a thread dies, metrics produced after the
|
||||
* most recent snapshot will be lost due to the use of
|
||||
* {@link MutableRatesWithAggregation}. However since threads are re-used
|
||||
* between operations this should not generally be an issue.
|
||||
*/
|
||||
class FSNamesystemLock {
|
||||
@VisibleForTesting
|
||||
protected ReentrantReadWriteLock coarseLock;
|
||||
|
||||
private final boolean metricsEnabled;
|
||||
private final MutableRatesWithAggregation detailedHoldTimeMetrics;
|
||||
private final Timer timer;
|
||||
|
||||
/**
|
||||
@ -80,12 +92,19 @@ public Long initialValue() {
|
||||
private final AtomicLong timeStampOfLastReadLockReport = new AtomicLong(0);
|
||||
private final AtomicLong longestReadLockHeldInterval = new AtomicLong(0);
|
||||
|
||||
FSNamesystemLock(Configuration conf) {
|
||||
this(conf, new Timer());
|
||||
@VisibleForTesting
|
||||
static final String OP_NAME_OTHER = "OTHER";
|
||||
private static final String READ_LOCK_METRIC_PREFIX = "FSNReadLock";
|
||||
private static final String WRITE_LOCK_METRIC_PREFIX = "FSNWriteLock";
|
||||
|
||||
FSNamesystemLock(Configuration conf,
|
||||
MutableRatesWithAggregation detailedHoldTimeMetrics) {
|
||||
this(conf, detailedHoldTimeMetrics, new Timer());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
FSNamesystemLock(Configuration conf, Timer timer) {
|
||||
FSNamesystemLock(Configuration conf,
|
||||
MutableRatesWithAggregation detailedHoldTimeMetrics, Timer timer) {
|
||||
boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
|
||||
FSNamesystem.LOG.info("fsLock is fair: " + fair);
|
||||
this.coarseLock = new ReentrantReadWriteLock(fair);
|
||||
@ -100,6 +119,12 @@ public Long initialValue() {
|
||||
this.lockSuppressWarningInterval = conf.getTimeDuration(
|
||||
DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY,
|
||||
DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS);
|
||||
this.metricsEnabled = conf.getBoolean(
|
||||
DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY,
|
||||
DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT);
|
||||
FSNamesystem.LOG.info("Detailed lock hold time metrics enabled: " +
|
||||
this.metricsEnabled);
|
||||
this.detailedHoldTimeMetrics = detailedHoldTimeMetrics;
|
||||
}
|
||||
|
||||
public void readLock() {
|
||||
@ -110,12 +135,17 @@ public void readLock() {
|
||||
}
|
||||
|
||||
public void readUnlock() {
|
||||
readUnlock(OP_NAME_OTHER);
|
||||
}
|
||||
|
||||
public void readUnlock(String opName) {
|
||||
final boolean needReport = coarseLock.getReadHoldCount() == 1;
|
||||
final long readLockInterval =
|
||||
timer.monotonicNow() - readLockHeldTimeStamp.get();
|
||||
coarseLock.readLock().unlock();
|
||||
|
||||
if (needReport) {
|
||||
addMetric(opName, readLockInterval, false);
|
||||
readLockHeldTimeStamp.remove();
|
||||
}
|
||||
if (needReport && readLockInterval >= this.readLockReportingThreshold) {
|
||||
@ -164,6 +194,10 @@ public void writeLockInterruptibly() throws InterruptedException {
|
||||
}
|
||||
|
||||
public void writeUnlock() {
|
||||
writeUnlock(OP_NAME_OTHER);
|
||||
}
|
||||
|
||||
public void writeUnlock(String opName) {
|
||||
final boolean needReport = coarseLock.getWriteHoldCount() == 1 &&
|
||||
coarseLock.isWriteLockedByCurrentThread();
|
||||
final long currentTime = timer.monotonicNow();
|
||||
@ -191,6 +225,10 @@ public void writeUnlock() {
|
||||
|
||||
coarseLock.writeLock().unlock();
|
||||
|
||||
if (needReport) {
|
||||
addMetric(opName, writeLockInterval, true);
|
||||
}
|
||||
|
||||
if (logReport) {
|
||||
FSNamesystem.LOG.info("FSNamesystem write lock held for " +
|
||||
writeLockInterval + " ms via\n" +
|
||||
@ -227,4 +265,19 @@ public Condition newWriteLockCondition() {
|
||||
public int getQueueLength() {
|
||||
return coarseLock.getQueueLength();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the lock hold time for a recent operation to the metrics.
|
||||
* @param operationName Name of the operation for which to record the time
|
||||
* @param value Length of time the lock was held
|
||||
*/
|
||||
private void addMetric(String operationName, long value, boolean isWrite) {
|
||||
if (metricsEnabled) {
|
||||
String metricName =
|
||||
(isWrite ? WRITE_LOCK_METRIC_PREFIX : READ_LOCK_METRIC_PREFIX) +
|
||||
org.apache.commons.lang.StringUtils.capitalize(operationName);
|
||||
detailedHoldTimeMetrics.add(metricName, value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -329,7 +329,7 @@ public void run() {
|
||||
needSync = checkLeases();
|
||||
}
|
||||
} finally {
|
||||
fsnamesystem.writeUnlock();
|
||||
fsnamesystem.writeUnlock("leaseManager");
|
||||
// lease reassignments should to be sync'ed.
|
||||
if (needSync) {
|
||||
fsnamesystem.getEditLog().logSync();
|
||||
|
@ -522,7 +522,7 @@ private LocatedBlocks getBlockLocations(String path, HdfsFileStatus file)
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
blocks = null;
|
||||
} finally {
|
||||
fsn.readUnlock();
|
||||
fsn.readUnlock("fsckGetBlockLocations");
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
@ -2678,6 +2678,14 @@
|
||||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>dfs.namenode.lock.detailed-metrics.enabled</name>
|
||||
<value>false</value>
|
||||
<description>If true, the namenode will keep track of how long various
|
||||
operations hold the Namesystem lock for and emit this as metrics.
|
||||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>dfs.namenode.startup.delay.block.deletion.sec</name>
|
||||
<value>0</value>
|
||||
|
@ -21,8 +21,12 @@
|
||||
import com.google.common.base.Supplier;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
|
||||
import org.apache.hadoop.test.MetricsAsserts;
|
||||
import org.apache.hadoop.util.FakeTimer;
|
||||
import org.apache.log4j.Level;
|
||||
import org.junit.Test;
|
||||
@ -36,6 +40,8 @@
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
|
||||
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
|
||||
|
||||
/**
|
||||
* Tests the FSNamesystemLock, looking at lock compatibilities and
|
||||
@ -48,17 +54,17 @@ public void testFsLockFairness() throws IOException, InterruptedException{
|
||||
Configuration conf = new Configuration();
|
||||
|
||||
conf.setBoolean("dfs.namenode.fslock.fair", true);
|
||||
FSNamesystemLock fsnLock = new FSNamesystemLock(conf);
|
||||
FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null);
|
||||
assertTrue(fsnLock.coarseLock.isFair());
|
||||
|
||||
conf.setBoolean("dfs.namenode.fslock.fair", false);
|
||||
fsnLock = new FSNamesystemLock(conf);
|
||||
fsnLock = new FSNamesystemLock(conf, null);
|
||||
assertFalse(fsnLock.coarseLock.isFair());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFSNamesystemLockCompatibility() {
|
||||
FSNamesystemLock rwLock = new FSNamesystemLock(new Configuration());
|
||||
FSNamesystemLock rwLock = new FSNamesystemLock(new Configuration(), null);
|
||||
|
||||
assertEquals(0, rwLock.getReadHoldCount());
|
||||
rwLock.readLock();
|
||||
@ -98,7 +104,7 @@ public void testFSLockGetWaiterCount() throws InterruptedException {
|
||||
final CountDownLatch latch = new CountDownLatch(threadCount);
|
||||
final Configuration conf = new Configuration();
|
||||
conf.setBoolean("dfs.namenode.fslock.fair", true);
|
||||
final FSNamesystemLock rwLock = new FSNamesystemLock(conf);
|
||||
final FSNamesystemLock rwLock = new FSNamesystemLock(conf, null);
|
||||
rwLock.writeLock();
|
||||
ExecutorService helper = Executors.newFixedThreadPool(threadCount);
|
||||
|
||||
@ -141,7 +147,7 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
|
||||
writeLockSuppressWarningInterval, TimeUnit.MILLISECONDS);
|
||||
|
||||
final FakeTimer timer = new FakeTimer();
|
||||
final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, timer);
|
||||
final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null, timer);
|
||||
timer.advance(writeLockSuppressWarningInterval);
|
||||
|
||||
LogCapturer logs = LogCapturer.captureLogs(FSNamesystem.LOG);
|
||||
@ -216,7 +222,7 @@ public void testFSReadLockLongHoldingReport() throws Exception {
|
||||
readLockSuppressWarningInterval, TimeUnit.MILLISECONDS);
|
||||
|
||||
final FakeTimer timer = new FakeTimer();
|
||||
final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, timer);
|
||||
final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null, timer);
|
||||
timer.advance(readLockSuppressWarningInterval);
|
||||
|
||||
LogCapturer logs = LogCapturer.captureLogs(FSNamesystem.LOG);
|
||||
@ -303,7 +309,7 @@ public void run() {
|
||||
t2.join();
|
||||
// Look for the differentiating class names in the stack trace
|
||||
String stackTracePatternString =
|
||||
String.format("INFO.+%s(.+\n){4}\\Q%%s\\E\\.run", readLockLogStmt);
|
||||
String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
|
||||
Pattern t1Pattern = Pattern.compile(
|
||||
String.format(stackTracePatternString, t1.getClass().getName()));
|
||||
assertTrue(t1Pattern.matcher(logs.getOutput()).find());
|
||||
@ -314,4 +320,43 @@ public void run() {
|
||||
"Number of suppressed read-lock reports: 2"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDetailedHoldMetrics() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY, true);
|
||||
FakeTimer timer = new FakeTimer();
|
||||
MetricsRegistry registry = new MetricsRegistry("Test");
|
||||
MutableRatesWithAggregation rates =
|
||||
registry.newRatesWithAggregation("Test");
|
||||
FSNamesystemLock fsLock = new FSNamesystemLock(conf, rates, timer);
|
||||
|
||||
fsLock.readLock();
|
||||
timer.advance(1);
|
||||
fsLock.readUnlock("foo");
|
||||
fsLock.readLock();
|
||||
timer.advance(2);
|
||||
fsLock.readUnlock("foo");
|
||||
|
||||
fsLock.readLock();
|
||||
timer.advance(1);
|
||||
fsLock.readLock();
|
||||
timer.advance(1);
|
||||
fsLock.readUnlock("bar");
|
||||
fsLock.readUnlock("bar");
|
||||
|
||||
fsLock.writeLock();
|
||||
timer.advance(1);
|
||||
fsLock.writeUnlock("baz");
|
||||
|
||||
MetricsRecordBuilder rb = MetricsAsserts.mockMetricsRecordBuilder();
|
||||
rates.snapshot(rb, true);
|
||||
|
||||
assertGauge("FSNReadLockFooAvgTime", 1.5, rb);
|
||||
assertCounter("FSNReadLockFooNumOps", 2L, rb);
|
||||
assertGauge("FSNReadLockBarAvgTime", 2.0, rb);
|
||||
assertCounter("FSNReadLockBarNumOps", 1L, rb);
|
||||
assertGauge("FSNWriteLockBazAvgTime", 1.0, rb);
|
||||
assertCounter("FSNWriteLockBazNumOps", 1L, rb);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user