From a87328dfab96a335535e8952e548534b73c00b7c Mon Sep 17 00:00:00 2001
From: Aaron Myers
Date: Tue, 14 Feb 2012 08:27:15 +0000
Subject: [PATCH] HDFS-2943. Expose last checkpoint time and transaction stats
 as JMX metrics. Contributed by Aaron T. Myers.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1243822 13f79535-47bb-0310-9956-ffa450edef68
---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |  3 +
 .../hadoop/hdfs/server/namenode/FSImage.java  |  6 +-
 .../hdfs/server/namenode/FSNamesystem.java    | 25 +++++++
 .../hdfs/server/namenode/NNStorage.java       | 22 +++++-
 .../namenode/metrics/TestNameNodeMetrics.java | 67 ++++++++++++++++---
 5 files changed, 109 insertions(+), 14 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index d61b914cd2..f936c4612c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -221,6 +221,9 @@ Release 0.23.2 - UNRELEASED
 
   NEW FEATURES
 
+    HDFS-2943. Expose last checkpoint time and transaction stats as JMX
+    metrics. (atm)
+
   IMPROVEMENTS
 
     HDFS-2931. Switch DataNode's BlockVolumeChoosingPolicy to private-audience.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
index eaa431bb8c..463fca5e0a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -711,7 +711,7 @@ private void loadFSImage(File curFile, MD5Hash expectedMd5,
     long txId = loader.getLoadedImageTxId();
     LOG.info("Loaded image for txid " + txId + " from " + curFile);
     lastAppliedTxId = txId;
-    storage.setMostRecentCheckpointTxId(txId);
+    storage.setMostRecentCheckpointInfo(txId, curFile.lastModified());
   }
 
   /**
@@ -728,7 +728,7 @@ void saveFSImage(SaveNamespaceContext context, StorageDirectory sd)
     saver.save(newFile, compression);
 
     MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
-    storage.setMostRecentCheckpointTxId(txid);
+    storage.setMostRecentCheckpointInfo(txid, Util.now());
   }
 
   /**
@@ -1032,7 +1032,7 @@ synchronized void saveDigestAndRenameCheckpointImage(
     // advertise it as such to other checkpointers
     // from now on
     if (txid > storage.getMostRecentCheckpointTxId()) {
-      storage.setMostRecentCheckpointTxId(txid);
+      storage.setMostRecentCheckpointInfo(txid, Util.now());
     }
   }
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index f58eff57c4..e70ed27c2f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -2680,6 +2680,31 @@ public long getMissingBlocksCount() {
   public int getExpiredHeartbeats() {
     return datanodeStatistics.getExpiredHeartbeats();
   }
+
+  @Metric({"TransactionsSinceLastCheckpoint",
+      "Number of transactions since last checkpoint"})
+  public long getTransactionsSinceLastCheckpoint() {
+    return getEditLog().getLastWrittenTxId() -
+        getFSImage().getStorage().getMostRecentCheckpointTxId();
+  }
+
+  @Metric({"TransactionsSinceLastLogRoll",
+      "Number of transactions since last edit log roll"})
+  public long getTransactionsSinceLastLogRoll() {
+    return (getEditLog().getLastWrittenTxId() -
+        getEditLog().getCurSegmentTxId()) + 1;
+  }
+
+  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
+  public long getLastWrittenTransactionId() {
+    return getEditLog().getLastWrittenTxId();
+  }
+
+  @Metric({"LastCheckpointTime",
+      "Time in milliseconds since the epoch of the last checkpoint"})
+  public long getLastCheckpointTime() {
+    return getFSImage().getStorage().getMostRecentCheckpointTime();
+  }
 
   /** @see ClientProtocol#getStats() */
   long[] getStats() {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
index 4ced54447a..7bca8f4b31 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
@@ -127,6 +127,11 @@ public boolean isOfType(StorageDirType type) {
    * that have since been written to the edit log.
    */
   protected long mostRecentCheckpointTxId = HdfsConstants.INVALID_TXID;
+
+  /**
+   * Time of the last checkpoint, in milliseconds since the epoch.
+   */
+  private long mostRecentCheckpointTime = 0;
 
   /**
    * list of failed (and thus removed) storages
@@ -440,18 +445,29 @@ void writeTransactionIdFile(StorageDirectory sd, long txid) throws IOException {
   }
 
   /**
-   * Set the transaction ID of the last checkpoint
+   * Set the transaction ID and time of the last checkpoint
+   *
+   * @param txid transaction id of the last checkpoint
+   * @param time time of the last checkpoint, in millis since the epoch
    */
-  void setMostRecentCheckpointTxId(long txid) {
+  void setMostRecentCheckpointInfo(long txid, long time) {
     this.mostRecentCheckpointTxId = txid;
+    this.mostRecentCheckpointTime = time;
   }
 
   /**
-   * Return the transaction ID of the last checkpoint.
+   * @return the transaction ID of the last checkpoint.
    */
   long getMostRecentCheckpointTxId() {
     return mostRecentCheckpointTxId;
   }
+
+  /**
+   * @return the time of the most recent checkpoint in millis since the epoch.
+   */
+  long getMostRecentCheckpointTime() {
+    return mostRecentCheckpointTime;
+  }
 
   /**
    * Write a small file in all available storage directories that
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index c8c528d0bb..014babd6d8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -20,13 +20,12 @@
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
+import static org.junit.Assert.*;
 
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.util.Random;
 
-import junit.framework.TestCase;
-
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
@@ -39,17 +38,21 @@
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.test.MetricsAsserts;
 import org.apache.log4j.Level;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
 
 /**
  * Test for metrics published by the Namenode
  */
-public class TestNameNodeMetrics extends TestCase {
+public class TestNameNodeMetrics {
   private static final Configuration CONF = new HdfsConfiguration();
   private static final int DFS_REPLICATION_INTERVAL = 1;
   private static final Path TEST_ROOT_DIR_PATH =
@@ -81,8 +84,8 @@ private static Path getTestPath(String fileName) {
     return new Path(TEST_ROOT_DIR_PATH, fileName);
   }
 
-  @Override
-  protected void setUp() throws Exception {
+  @Before
+  public void setUp() throws Exception {
     cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT).build();
     cluster.waitActive();
     namesystem = cluster.getNamesystem();
@@ -90,8 +93,8 @@ protected void setUp() throws Exception {
     fs = (DistributedFileSystem) cluster.getFileSystem();
   }
 
-  @Override
-  protected void tearDown() throws Exception {
+  @After
+  public void tearDown() throws Exception {
     cluster.shutdown();
   }
 
@@ -115,6 +118,7 @@ private void readFile(FileSystem fileSys,Path name) throws IOException {
   }
 
   /** Test metrics associated with addition of a file */
+  @Test
   public void testFileAdd() throws Exception {
     // Add files with 100 blocks
     final Path file = getTestPath("testFileAdd");
@@ -159,6 +163,7 @@ public void testFileAdd() throws Exception {
   }
 
   /** Corrupt a block and ensure metrics reflects it */
+  @Test
   public void testCorruptBlock() throws Exception {
     // Create a file with single block with two replicas
     final Path file = getTestPath("testCorruptBlock");
@@ -184,6 +189,7 @@
 
   /** Create excess blocks by reducing the replication factor for
    * for a file and ensure metrics reflects it */
+  @Test
   public void testExcessBlocks() throws Exception {
     Path file = getTestPath("testExcessBlocks");
     createFile(file, 100, (short)2);
@@ -196,6 +202,7 @@ public void testExcessBlocks() throws Exception {
   }
 
   /** Test to ensure metrics reflects missing blocks */
+  @Test
   public void testMissingBlock() throws Exception {
     // Create a file with single block with two replicas
     Path file = getTestPath("testMissingBlocks");
@@ -220,6 +227,7 @@ private void waitForDeletion() throws InterruptedException {
     Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
   }
 
+  @Test
   public void testRenameMetrics() throws Exception {
     Path src = getTestPath("src");
     createFile(src, 100, (short)1);
@@ -244,7 +252,8 @@ public void testRenameMetrics() throws Exception {
    *
    * @throws IOException in case of an error
    */
-  public void testGetBlockLocationMetric() throws Exception{
+  @Test
+  public void testGetBlockLocationMetric() throws Exception {
     Path file1_Path = new Path(TEST_ROOT_DIR_PATH, "file1.dat");
 
     // When cluster starts first time there are no file (read,create,open)
@@ -272,4 +281,46 @@ public void testGetBlockLocationMetric() throws Exception{
     updateMetrics();
     assertCounter("GetBlockLocations", 3L, getMetrics(NN_METRICS));
   }
+
+  /**
+   * Test NN checkpoint and transaction-related metrics.
+   */
+  @Test
+  public void testTransactionAndCheckpointMetrics() throws Exception {
+    long lastCkptTime = MetricsAsserts.getLongGauge("LastCheckpointTime",
+        getMetrics(NS_METRICS));
+
+    assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
+    assertGauge("LastWrittenTransactionId", 1L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastCheckpoint", 1L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
+
+    fs.mkdirs(new Path(TEST_ROOT_DIR_PATH, "/tmp"));
+    updateMetrics();
+
+    assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
+    assertGauge("LastWrittenTransactionId", 2L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastCheckpoint", 2L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastLogRoll", 2L, getMetrics(NS_METRICS));
+
+    cluster.getNameNodeRpc().rollEditLog();
+    updateMetrics();
+
+    assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
+    assertGauge("LastWrittenTransactionId", 4L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastCheckpoint", 4L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
+
+    cluster.getNameNodeRpc().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+    cluster.getNameNodeRpc().saveNamespace();
+    cluster.getNameNodeRpc().setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+    updateMetrics();
+
+    long newLastCkptTime = MetricsAsserts.getLongGauge("LastCheckpointTime",
+        getMetrics(NS_METRICS));
+    assertTrue(lastCkptTime < newLastCkptTime);
+    assertGauge("LastWrittenTransactionId", 6L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastCheckpoint", 1L, getMetrics(NS_METRICS));
+    assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
+  }
 }
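
Editor's note (not part of the patch): the @Metric getters added to
FSNamesystem are collected by the metrics2 system and published over JMX once
the NameNode registers its metrics sources. The sketch below shows one way a
remote client might read the new gauges. It is a minimal sketch under stated
assumptions: the ObjectName "Hadoop:service=NameNode,name=FSNamesystem"
reflects how metrics2 typically registers the FSNamesystem source, and
"nn-host" plus port 8004 stand in for a NameNode started with remote JMX
enabled; none of these values are configured by the patch itself.

  import javax.management.MBeanServerConnection;
  import javax.management.ObjectName;
  import javax.management.remote.JMXConnector;
  import javax.management.remote.JMXConnectorFactory;
  import javax.management.remote.JMXServiceURL;

  public class CheckpointMetricsProbe {
    public static void main(String[] args) throws Exception {
      // Assumption: the NameNode JVM was started with remote JMX enabled,
      // e.g. -Dcom.sun.management.jmxremote.port=8004; "nn-host" and the
      // port are placeholders, not values this patch sets up.
      JMXServiceURL url = new JMXServiceURL(
          "service:jmx:rmi:///jndi/rmi://nn-host:8004/jmxrmi");
      JMXConnector connector = JMXConnectorFactory.connect(url);
      try {
        MBeanServerConnection mbsc = connector.getMBeanServerConnection();
        // Assumption: metrics2 registers the FSNamesystem source under this
        // ObjectName; adjust it if a deployment names the bean differently.
        ObjectName fsns =
            new ObjectName("Hadoop:service=NameNode,name=FSNamesystem");
        // The new gauges are long-valued attributes on that MBean.
        long lastCheckpoint =
            (Long) mbsc.getAttribute(fsns, "LastCheckpointTime");
        long txnsSinceCheckpoint =
            (Long) mbsc.getAttribute(fsns, "TransactionsSinceLastCheckpoint");
        long txnsSinceRoll =
            (Long) mbsc.getAttribute(fsns, "TransactionsSinceLastLogRoll");
        System.out.println("Last checkpoint at " + lastCheckpoint
            + " ms since the epoch; " + txnsSinceCheckpoint
            + " transactions since the checkpoint, " + txnsSinceRoll
            + " since the last log roll.");
      } finally {
        connector.close();
      }
    }
  }

For a quick look without a JMX client, the NameNode's HTTP /jmx servlet
(e.g. http://nn-host:50070/jmx) serves the same MBean attributes as JSON.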