From 97e244947719d483c3f80521a00fec8e13dcb637 Mon Sep 17 00:00:00 2001 From: Arpit Agarwal Date: Thu, 2 Jun 2016 13:14:45 -0700 Subject: [PATCH] HDFS-10341. Add a metric to expose the timeout number of pending replication blocks. (Contributed by Akira Ajisaka) --- .../src/site/markdown/Metrics.md | 1 + .../server/blockmanagement/BlockManager.java | 4 ++++ .../PendingReconstructionBlocks.java | 16 ++++++++++++++- .../hdfs/server/namenode/FSNamesystem.java | 5 +++++ .../TestPendingReconstruction.java | 20 +++++++++++++------ 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index 699316f605..e4e2443caf 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -218,6 +218,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf | `TotalSyncCount` | Total number of sync operations performed by edit log | | `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation| | `NameDirSize` | NameNode name directories size in bytes | +| `NumTimedOutPendingReconstructions` | The number of timed out reconstructions. Not the number of unique blocks that timed out. | JournalNode ----------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index ed57a86431..1a76e0926d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -198,6 +198,10 @@ public class BlockManager implements BlockStatsMXBean { public int getPendingDataNodeMessageCount() { return pendingDNMessages.count(); } + /** Used by metrics. */ + public long getNumTimedOutPendingReconstructions() { + return pendingReconstruction.getNumTimedOuts(); + } /**replicationRecheckInterval is how often namenode checks for new replication work*/ private final long replicationRecheckInterval; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReconstructionBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReconstructionBlocks.java index 528199c8cb..956e94f4df 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReconstructionBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReconstructionBlocks.java @@ -50,6 +50,7 @@ class PendingReconstructionBlocks { private final ArrayList timedOutItems; Daemon timerThread = null; private volatile boolean fsRunning = true; + private long timedOutCount = 0L; // // It might take anywhere between 5 to 10 minutes before @@ -125,6 +126,7 @@ class PendingReconstructionBlocks { synchronized (pendingReconstructions) { pendingReconstructions.clear(); timedOutItems.clear(); + timedOutCount = 0L; } } @@ -148,6 +150,16 @@ class PendingReconstructionBlocks { return 0; } + /** + * Used for metrics. + * @return The number of timeouts + */ + long getNumTimedOuts() { + synchronized (timedOutItems) { + return timedOutCount + timedOutItems.size(); + } + } + /** * Returns a list of blocks that have timed out their * reconstruction requests. Returns null if no blocks have @@ -158,9 +170,11 @@ class PendingReconstructionBlocks { if (timedOutItems.size() <= 0) { return null; } + int size = timedOutItems.size(); BlockInfo[] blockList = timedOutItems.toArray( - new BlockInfo[timedOutItems.size()]); + new BlockInfo[size]); timedOutItems.clear(); + timedOutCount += size; return blockList; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index a2df6d22c1..c9f2487dfb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4507,6 +4507,11 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean, public long getExcessBlocks() { return blockManager.getExcessBlocksCount(); } + + @Metric + public long getNumTimedOutPendingReconstructions() { + return blockManager.getNumTimedOutPendingReconstructions(); + } // HA-only metric @Metric diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReconstruction.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReconstruction.java index d07c65763e..c30f63089f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReconstruction.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReconstruction.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs.server.blockmanagement; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.ArrayList; @@ -117,14 +119,15 @@ public class TestPendingReconstruction { // // verify that nothing has timed out so far // - assertTrue(pendingReconstructions.getTimedOutBlocks() == null); + assertNull(pendingReconstructions.getTimedOutBlocks()); + assertEquals(0L, pendingReconstructions.getNumTimedOuts()); // // Wait for one second and then insert some more items. // try { Thread.sleep(1000); - } catch (Exception e) { + } catch (Exception ignored) { } for (int i = 10; i < 15; i++) { @@ -133,7 +136,8 @@ public class TestPendingReconstruction { DatanodeStorageInfo.toDatanodeDescriptors( DFSTestUtil.createDatanodeStorageInfos(i))); } - assertTrue(pendingReconstructions.size() == 15); + assertEquals(15, pendingReconstructions.size()); + assertEquals(0L, pendingReconstructions.getNumTimedOuts()); // // Wait for everything to timeout. @@ -153,10 +157,14 @@ public class TestPendingReconstruction { // Verify that everything has timed out. // assertEquals("Size of pendingReconstructions ", 0, pendingReconstructions.size()); + assertEquals(15L, pendingReconstructions.getNumTimedOuts()); Block[] timedOut = pendingReconstructions.getTimedOutBlocks(); - assertTrue(timedOut != null && timedOut.length == 15); - for (int i = 0; i < timedOut.length; i++) { - assertTrue(timedOut[i].getBlockId() < 15); + assertNotNull(timedOut); + assertEquals(15, timedOut.length); + // Verify the number is not reset + assertEquals(15L, pendingReconstructions.getNumTimedOuts()); + for (Block block : timedOut) { + assertTrue(block.getBlockId() < 15); } pendingReconstructions.stop(); }