From 94b3c6dd90593b78bb72a83bd83e6c0358f0c50f Mon Sep 17 00:00:00 2001
From: rdingankar
Date: Mon, 27 Feb 2023 10:26:32 -0800
Subject: [PATCH] HDFS-16917 Add transfer rate quantile metrics for DataNode reads (#5397)

Co-authored-by: Ravindra Dingankar
---
 .../hadoop-common/src/site/markdown/Metrics.md     |  3 +++
 .../java/org/apache/hadoop/hdfs/DFSUtil.java       | 15 +++++++++++++++
 .../hadoop/hdfs/server/datanode/DataXceiver.java   |  3 +++
 .../server/datanode/metrics/DataNodeMetrics.java   | 14 ++++++++++++++
 .../java/org/apache/hadoop/hdfs/TestDFSUtil.java   | 16 ++++++++++++++++
 .../server/datanode/TestDataNodeMetrics.java       |  4 ++++
 6 files changed, 55 insertions(+)

diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 9ad1cbd723..92b3ea452b 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -368,6 +368,9 @@ Each metrics record contains tags such as SessionId and Hostname as additional i
 |:---- |:---- |
 | `BytesWritten` | Total number of bytes written to DataNode |
 | `BytesRead` | Total number of bytes read from DataNode |
+| `ReadTransferRateNumOps` | Total number of data read transfers |
+| `ReadTransferRateAvgTime` | Average transfer rate of bytes read from DataNode, measured in bytes per second. |
+| `ReadTransferRate`*num*`s(50/75/90/95/99)thPercentileRate` | The 50/75/90/95/99th percentile of the transfer rate of bytes read from DataNode, measured in bytes per second. |
 | `BlocksWritten` | Total number of blocks written to DataNode |
 | `BlocksRead` | Total number of blocks read from DataNode |
 | `BlocksReplicated` | Total number of blocks replicated |
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
index 9e543859a8..8f5e05f62f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
@@ -69,6 +69,7 @@
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.fs.ParentNotDirectoryException;
 import org.apache.hadoop.fs.UnresolvedLinkException;
+import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
 import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
 import org.apache.hadoop.hdfs.server.namenode.INodesInPath;
 import org.apache.hadoop.ipc.ProtobufRpcEngine;
@@ -1893,4 +1894,18 @@ public static boolean isParentEntry(final String path, final String parent) {
     return path.charAt(parent.length()) == Path.SEPARATOR_CHAR
         || parent.equals(Path.SEPARATOR);
   }
+
+  /**
+   * Add transfer rate metrics for valid data read and duration values.
+   * @param metrics metrics for datanodes
+   * @param read bytes read
+   * @param duration read duration
+   */
+  public static void addTransferRateMetric(final DataNodeMetrics metrics, final long read, final long duration) {
+    if (read >= 0 && duration > 0) {
+      metrics.addReadTransferRate(read * 1000 / duration);
+    } else {
+      LOG.warn("Unexpected value for data transfer bytes={} duration={}", read, duration);
+    }
+  }
 }
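A note on the helper above: `duration` is the same value the caller passes to `incrTotalReadTime`, whose `@Metric` annotation describes it as milliseconds, hence the `* 1000` scaling to bytes per second. A minimal standalone sketch of that arithmetic (class and method names here are illustrative, not part of the patch):

```java
// Illustrative only: mirrors the arithmetic in DFSUtil#addTransferRateMetric.
public final class TransferRateMath {

  /** Bytes per second from a byte count and a duration in milliseconds. */
  static long toBytesPerSecond(long bytesRead, long durationMs) {
    if (bytesRead < 0 || durationMs <= 0) {
      // The patch logs a warning and records nothing for such inputs.
      throw new IllegalArgumentException(
          "bytes=" + bytesRead + " duration=" + durationMs);
    }
    // Integer division truncates: 999 bytes over 1000 ms reports 0 bytes/s.
    return bytesRead * 1000 / durationMs;
  }

  public static void main(String[] args) {
    System.out.println(toBytesPerSecond(100, 10)); // 10000, as in TestDFSUtil
  }
}
```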
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
index 300d84157b..00a40bafbf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataXceiver.java
@@ -24,6 +24,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.hadoop.fs.FsTracer;
 import org.apache.hadoop.fs.StorageType;
+import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.DFSUtilClient;
 import org.apache.hadoop.hdfs.ExtendedBlockId;
 import org.apache.hadoop.hdfs.net.Peer;
@@ -633,6 +634,7 @@ public void readBlock(final ExtendedBlock block,
       datanode.metrics.incrBytesRead((int) read);
       datanode.metrics.incrBlocksRead();
       datanode.metrics.incrTotalReadTime(duration);
+      DFSUtil.addTransferRateMetric(datanode.metrics, read, duration);
     } catch ( SocketException ignored ) {
       LOG.trace("{}:Ignoring exception while serving {} to {}",
           dnR, block, remoteAddress, ignored);
@@ -1122,6 +1124,7 @@ public void copyBlock(final ExtendedBlock block,
       datanode.metrics.incrBytesRead((int) read);
       datanode.metrics.incrBlocksRead();
       datanode.metrics.incrTotalReadTime(duration);
+      DFSUtil.addTransferRateMetric(datanode.metrics, read, duration);
 
       LOG.info("Copied {} to {}", block, peer.getRemoteAddressString());
     } catch (IOException ioe) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
index 6bccfc707b..d7eee80720 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
@@ -61,6 +61,8 @@ public class DataNodeMetrics {
   @Metric MutableCounterLong bytesRead;
   @Metric("Milliseconds spent reading") MutableCounterLong totalReadTime;
+  @Metric private MutableRate readTransferRate;
+  final private MutableQuantiles[] readTransferRateQuantiles;
   @Metric MutableCounterLong blocksWritten;
   @Metric MutableCounterLong blocksRead;
   @Metric MutableCounterLong blocksReplicated;
@@ -201,6 +203,7 @@ public DataNodeMetrics(String name, String sessionId, int[] intervals,
     sendDataPacketTransferNanosQuantiles = new MutableQuantiles[len];
     ramDiskBlocksEvictionWindowMsQuantiles = new MutableQuantiles[len];
     ramDiskBlocksLazyPersistWindowMsQuantiles = new MutableQuantiles[len];
+    readTransferRateQuantiles = new MutableQuantiles[len];
 
     for (int i = 0; i < len; i++) {
       int interval = intervals[i];
@@ -229,6 +232,10 @@ public DataNodeMetrics(String name, String sessionId, int[] intervals,
           "ramDiskBlocksLazyPersistWindows" + interval + "s",
           "Time between the RamDisk block write and disk persist in ms",
           "ops", "latency", interval);
+      readTransferRateQuantiles[i] = registry.newQuantiles(
+          "readTransferRate" + interval + "s",
+          "Rate at which bytes are read from datanode calculated in bytes per second",
+          "ops", "rate", interval);
     }
   }
@@ -290,6 +297,13 @@ public void addIncrementalBlockReport(long latency,
     }
   }
 
+  public void addReadTransferRate(long readTransferRate) {
+    this.readTransferRate.add(readTransferRate);
+    for (MutableQuantiles q : readTransferRateQuantiles) {
+      q.add(readTransferRate);
+    }
+  }
+
   public void addCacheReport(long latency) {
     cacheReports.add(latency);
   }
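For readers unfamiliar with the metrics2 pattern used above: the `MutableRate` backs the `ReadTransferRateNumOps`/`ReadTransferRateAvgTime` aggregates, while each `MutableQuantiles` keeps a sliding window of samples per configured rollover interval. A self-contained sketch of the same wiring, assuming only hadoop-common on the classpath (the class name and registry label are hypothetical, not the committed DataNodeMetrics):

```java
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;

// Hypothetical standalone class illustrating the MutableRate + MutableQuantiles pattern.
public class ReadTransferRateSketch {
  private final MetricsRegistry registry = new MetricsRegistry("sketch");
  // Publishes the NumOps/AvgTime pair.
  private final MutableRate readTransferRate =
      registry.newRate("readTransferRate", "Read transfer rate", false);
  // One quantile estimator per rollover interval, as in the constructor above.
  private final MutableQuantiles[] readTransferRateQuantiles;

  public ReadTransferRateSketch(int[] intervals) {
    readTransferRateQuantiles = new MutableQuantiles[intervals.length];
    for (int i = 0; i < intervals.length; i++) {
      readTransferRateQuantiles[i] = registry.newQuantiles(
          "readTransferRate" + intervals[i] + "s",
          "Read transfer rate in bytes per second",
          "ops", "rate", intervals[i]);
    }
  }

  // Every sample feeds both the rate and each interval's quantile window.
  public void addReadTransferRate(long bytesPerSecond) {
    readTransferRate.add(bytesPerSecond);
    for (MutableQuantiles q : readTransferRateQuantiles) {
      q.add(bytesPerSecond);
    }
  }
}
```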
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
index f1e69d0cf4..6957d7f756 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
@@ -45,6 +45,7 @@
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.*;
 
 import java.io.File;
 import java.io.IOException;
@@ -70,6 +71,7 @@
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
+import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider;
 import org.apache.hadoop.http.HttpConfig;
@@ -1109,4 +1111,18 @@ public void testErrorMessageForInvalidNameservice() throws Exception {
     LambdaTestUtils.intercept(IOException.class, expectedErrorMessage,
         ()->DFSUtil.getNNServiceRpcAddressesForCluster(conf));
   }
+
+  @Test
+  public void testAddTransferRateMetricForValidValues() {
+    DataNodeMetrics mockMetrics = mock(DataNodeMetrics.class);
+    DFSUtil.addTransferRateMetric(mockMetrics, 100, 10);
+    verify(mockMetrics).addReadTransferRate(10000);
+  }
+
+  @Test
+  public void testAddTransferRateMetricForInvalidValue() {
+    DataNodeMetrics mockMetrics = mock(DataNodeMetrics.class);
+    DFSUtil.addTransferRateMetric(mockMetrics, 100, 0);
+    verify(mockMetrics, times(0)).addReadTransferRate(anyLong());
+  }
 }
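One boundary the two tests above do not pin down is `read == 0`, which the `read >= 0` guard treats as valid input. A hypothetical additional case in the same style, relying on the same static Mockito imports (not part of the patch):

```java
  @Test
  public void testAddTransferRateMetricForZeroBytesRead() {
    DataNodeMetrics mockMetrics = mock(DataNodeMetrics.class);
    // 0 bytes over a positive duration is valid and records a rate of 0.
    DFSUtil.addTransferRateMetric(mockMetrics, 0, 10);
    verify(mockMetrics).addReadTransferRate(0);
  }
```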
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java
index 85664ebc95..3a365ef82e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java
@@ -274,6 +274,7 @@ public void testTimeoutMetric() throws Exception {
   @Test(timeout=120000)
   public void testDataNodeTimeSpend() throws Exception {
     Configuration conf = new HdfsConfiguration();
+    conf.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, "" + 60);
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
     try {
       final FileSystem fs = cluster.getFileSystem();
@@ -285,6 +286,7 @@ public void testDataNodeTimeSpend() throws Exception {
 
       final long startWriteValue = getLongCounter("TotalWriteTime", rb);
       final long startReadValue = getLongCounter("TotalReadTime", rb);
+      assertCounter("ReadTransferRateNumOps", 0L, rb);
       final AtomicInteger x = new AtomicInteger(0);
 
       // Lets Metric system update latest metrics
@@ -304,6 +306,8 @@ public Boolean get() {
         MetricsRecordBuilder rbNew = getMetrics(datanode.getMetrics().name());
         final long endWriteValue = getLongCounter("TotalWriteTime", rbNew);
         final long endReadValue = getLongCounter("TotalReadTime", rbNew);
+        assertCounter("ReadTransferRateNumOps", 1L, rbNew);
+        assertQuantileGauges("ReadTransferRate" + "60s", rbNew, "Rate");
        return endWriteValue > startWriteValue && endReadValue > startReadValue;
      }
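The quantile gauges only exist when percentile intervals are configured, which is why the test sets `DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY`; without it, only the `MutableRate` side (`ReadTransferRateNumOps`/`ReadTransferRateAvgTime`) is published. A sketch of enabling a 60-second window from client code, assuming the literal key string matches that constant:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class EnableReadTransferRateQuantiles {
  public static void main(String[] args) {
    Configuration conf = new HdfsConfiguration();
    // Same effect as the test's conf.set(DFS_METRICS_PERCENTILES_INTERVALS_KEY, "60");
    // multiple comma-separated intervals, e.g. "60,300", are also accepted.
    conf.set("dfs.metrics.percentiles.intervals", "60");
    // Per the Metrics.md rows added above, a DataNode started with this conf
    // publishes gauges named ReadTransferRate60s50thPercentileRate through
    // ReadTransferRate60s99thPercentileRate, in bytes per second.
  }
}
```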