From 9cb51bf106802c78b1400fba9f1d1c7e772dd5e7 Mon Sep 17 00:00:00 2001 From: tomscut Date: Mon, 1 Mar 2021 16:35:12 -0800 Subject: [PATCH] HDFS-15808. Add metrics for FSNamesystem read/write lock hold long time. (#2668) Contributed by tomscut. --- .../hdfs/server/namenode/FSNamesystem.java | 14 ++++++++ .../server/namenode/FSNamesystemLock.java | 34 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 22b4b92f44..ff03d7b052 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4830,6 +4830,20 @@ public int getFsLockQueueLength() { return fsLock.getQueueLength(); } + @Metric(value = {"ReadLockLongHoldCount", "The number of time " + + "the read lock has been held for longer than the threshold"}, + type = Metric.Type.COUNTER) + public long getNumOfReadLockLongHold() { + return fsLock.getNumOfReadLockLongHold(); + } + + @Metric(value = {"WriteLockLongHoldCount", "The number of time " + + "the write lock has been held for longer than the threshold"}, + type = Metric.Type.COUNTER) + public long getNumOfWriteLockLongHold() { + return fsLock.getNumOfWriteLockLongHold(); + } + int getNumberOfDatanodes(DatanodeReportType type) { readLock(); try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java index c03cfd5075..842c6b3f2d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java @@ -109,6 +109,16 @@ public Long initialValue() { private final AtomicReference longestReadLockHeldInfo = new AtomicReference<>(new LockHeldInfo()); private LockHeldInfo longestWriteLockHeldInfo = new LockHeldInfo(); + /** + * The number of time the read lock + * has been held longer than the threshold. + */ + private final AtomicLong numReadLockLongHold = new AtomicLong(0); + /** + * The number of time the write lock + * has been held for longer than the threshold. + */ + private final AtomicLong numWriteLockLongHold = new AtomicLong(0); @VisibleForTesting static final String OP_NAME_OTHER = "OTHER"; @@ -182,6 +192,7 @@ public void readUnlock(String opName, final long readLockIntervalMs = TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos); if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) { + numReadLockLongHold.incrementAndGet(); String lockReportInfo = null; boolean done = false; while (!done) { @@ -298,6 +309,7 @@ private void writeUnlock(String opName, boolean suppressWriteLockReport, LogAction logAction = LogThrottlingHelper.DO_NOT_LOG; if (needReport && writeLockIntervalMs >= this.writeLockReportingThresholdMs) { + numWriteLockLongHold.incrementAndGet(); if (longestWriteLockHeldInfo.getIntervalMs() <= writeLockIntervalMs) { String lockReportInfo = lockReportInfoSupplier != null ? " (" + lockReportInfoSupplier.get() + ")" : ""; @@ -362,6 +374,28 @@ public int getQueueLength() { return coarseLock.getQueueLength(); } + /** + * Returns the number of time the read lock + * has been held longer than the threshold. + * + * @return long - Number of time the read lock + * has been held longer than the threshold + */ + public long getNumOfReadLockLongHold() { + return numReadLockLongHold.get(); + } + + /** + * Returns the number of time the write lock + * has been held longer than the threshold. + * + * @return long - Number of time the write lock + * has been held longer than the threshold. + */ + public long getNumOfWriteLockLongHold() { + return numWriteLockLongHold.get(); + } + /** * Add the lock hold time for a recent operation to the metrics. * @param operationName Name of the operation for which to record the time