HDFS-16917 Add transfer rate quantile metrics for DataNode reads (#5397)
Co-authored-by: Ravindra Dingankar <rdingankar@linkedin.com>
This commit is contained in:
parent
61f369c43e
commit
0ca5686034
@ -370,6 +370,9 @@ Each metrics record contains tags such as SessionId and Hostname as additional i
|
|||||||
|:---- |:---- |
|
|:---- |:---- |
|
||||||
| `BytesWritten` | Total number of bytes written to DataNode |
|
| `BytesWritten` | Total number of bytes written to DataNode |
|
||||||
| `BytesRead` | Total number of bytes read from DataNode |
|
| `BytesRead` | Total number of bytes read from DataNode |
|
||||||
|
| `ReadTransferRateNumOps` | Total number of data read transfers |
|
||||||
|
| `ReadTransferRateAvgTime` | Average transfer rate of bytes read from DataNode, measured in bytes per second. |
|
||||||
|
| `ReadTransferRate`*num*`s(50/75/90/95/99)thPercentileRate` | The 50/75/90/95/99th percentile of the transfer rate of bytes read from DataNode, measured in bytes per second. |
|
||||||
| `BlocksWritten` | Total number of blocks written to DataNode |
|
| `BlocksWritten` | Total number of blocks written to DataNode |
|
||||||
| `BlocksRead` | Total number of blocks read from DataNode |
|
| `BlocksRead` | Total number of blocks read from DataNode |
|
||||||
| `BlocksReplicated` | Total number of blocks replicated |
|
| `BlocksReplicated` | Total number of blocks replicated |
|
||||||
|
@ -69,6 +69,7 @@
|
|||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.fs.ParentNotDirectoryException;
|
import org.apache.hadoop.fs.ParentNotDirectoryException;
|
||||||
import org.apache.hadoop.fs.UnresolvedLinkException;
|
import org.apache.hadoop.fs.UnresolvedLinkException;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
|
import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.INodesInPath;
|
import org.apache.hadoop.hdfs.server.namenode.INodesInPath;
|
||||||
import org.apache.hadoop.ipc.ProtobufRpcEngine;
|
import org.apache.hadoop.ipc.ProtobufRpcEngine;
|
||||||
@ -1936,4 +1937,18 @@ public static boolean isParentEntry(final String path, final String parent) {
|
|||||||
return path.charAt(parent.length()) == Path.SEPARATOR_CHAR
|
return path.charAt(parent.length()) == Path.SEPARATOR_CHAR
|
||||||
|| parent.equals(Path.SEPARATOR);
|
|| parent.equals(Path.SEPARATOR);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add transfer rate metrics for valid data read and duration values.
|
||||||
|
* @param metrics metrics for datanodes
|
||||||
|
* @param read bytes read
|
||||||
|
* @param duration read duration
|
||||||
|
*/
|
||||||
|
public static void addTransferRateMetric(final DataNodeMetrics metrics, final long read, final long duration) {
|
||||||
|
if (read >= 0 && duration > 0) {
|
||||||
|
metrics.addReadTransferRate(read * 1000 / duration);
|
||||||
|
} else {
|
||||||
|
LOG.warn("Unexpected value for data transfer bytes={} duration={}", read, duration);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package org.apache.hadoop.hdfs.server.datanode;
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
import org.apache.hadoop.classification.VisibleForTesting;
|
import org.apache.hadoop.classification.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.hdfs.DFSUtil;
|
||||||
import org.apache.hadoop.util.Preconditions;
|
import org.apache.hadoop.util.Preconditions;
|
||||||
import org.apache.hadoop.thirdparty.protobuf.ByteString;
|
import org.apache.hadoop.thirdparty.protobuf.ByteString;
|
||||||
import javax.crypto.SecretKey;
|
import javax.crypto.SecretKey;
|
||||||
@ -632,6 +633,7 @@ public void readBlock(final ExtendedBlock block,
|
|||||||
datanode.metrics.incrBytesRead((int) read);
|
datanode.metrics.incrBytesRead((int) read);
|
||||||
datanode.metrics.incrBlocksRead();
|
datanode.metrics.incrBlocksRead();
|
||||||
datanode.metrics.incrTotalReadTime(duration);
|
datanode.metrics.incrTotalReadTime(duration);
|
||||||
|
DFSUtil.addTransferRateMetric(datanode.metrics, read, duration);
|
||||||
} catch ( SocketException ignored ) {
|
} catch ( SocketException ignored ) {
|
||||||
LOG.trace("{}:Ignoring exception while serving {} to {}",
|
LOG.trace("{}:Ignoring exception while serving {} to {}",
|
||||||
dnR, block, remoteAddress, ignored);
|
dnR, block, remoteAddress, ignored);
|
||||||
@ -1122,6 +1124,7 @@ public void copyBlock(final ExtendedBlock block,
|
|||||||
datanode.metrics.incrBytesRead((int) read);
|
datanode.metrics.incrBytesRead((int) read);
|
||||||
datanode.metrics.incrBlocksRead();
|
datanode.metrics.incrBlocksRead();
|
||||||
datanode.metrics.incrTotalReadTime(duration);
|
datanode.metrics.incrTotalReadTime(duration);
|
||||||
|
DFSUtil.addTransferRateMetric(datanode.metrics, read, duration);
|
||||||
|
|
||||||
LOG.info("Copied {} to {}", block, peer.getRemoteAddressString());
|
LOG.info("Copied {} to {}", block, peer.getRemoteAddressString());
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
|
@ -61,6 +61,8 @@ public class DataNodeMetrics {
|
|||||||
@Metric MutableCounterLong bytesRead;
|
@Metric MutableCounterLong bytesRead;
|
||||||
@Metric("Milliseconds spent reading")
|
@Metric("Milliseconds spent reading")
|
||||||
MutableCounterLong totalReadTime;
|
MutableCounterLong totalReadTime;
|
||||||
|
@Metric private MutableRate readTransferRate;
|
||||||
|
final private MutableQuantiles[] readTransferRateQuantiles;
|
||||||
@Metric MutableCounterLong blocksWritten;
|
@Metric MutableCounterLong blocksWritten;
|
||||||
@Metric MutableCounterLong blocksRead;
|
@Metric MutableCounterLong blocksRead;
|
||||||
@Metric MutableCounterLong blocksReplicated;
|
@Metric MutableCounterLong blocksReplicated;
|
||||||
@ -227,6 +229,7 @@ public DataNodeMetrics(String name, String sessionId, int[] intervals,
|
|||||||
sendDataPacketTransferNanosQuantiles = new MutableQuantiles[len];
|
sendDataPacketTransferNanosQuantiles = new MutableQuantiles[len];
|
||||||
ramDiskBlocksEvictionWindowMsQuantiles = new MutableQuantiles[len];
|
ramDiskBlocksEvictionWindowMsQuantiles = new MutableQuantiles[len];
|
||||||
ramDiskBlocksLazyPersistWindowMsQuantiles = new MutableQuantiles[len];
|
ramDiskBlocksLazyPersistWindowMsQuantiles = new MutableQuantiles[len];
|
||||||
|
readTransferRateQuantiles = new MutableQuantiles[len];
|
||||||
|
|
||||||
for (int i = 0; i < len; i++) {
|
for (int i = 0; i < len; i++) {
|
||||||
int interval = intervals[i];
|
int interval = intervals[i];
|
||||||
@ -255,6 +258,10 @@ public DataNodeMetrics(String name, String sessionId, int[] intervals,
|
|||||||
"ramDiskBlocksLazyPersistWindows" + interval + "s",
|
"ramDiskBlocksLazyPersistWindows" + interval + "s",
|
||||||
"Time between the RamDisk block write and disk persist in ms",
|
"Time between the RamDisk block write and disk persist in ms",
|
||||||
"ops", "latency", interval);
|
"ops", "latency", interval);
|
||||||
|
readTransferRateQuantiles[i] = registry.newQuantiles(
|
||||||
|
"readTransferRate" + interval + "s",
|
||||||
|
"Rate at which bytes are read from datanode calculated in bytes per second",
|
||||||
|
"ops", "rate", interval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -316,6 +323,13 @@ public void addIncrementalBlockReport(long latency,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void addReadTransferRate(long readTransferRate) {
|
||||||
|
this.readTransferRate.add(readTransferRate);
|
||||||
|
for (MutableQuantiles q : readTransferRateQuantiles) {
|
||||||
|
q.add(readTransferRate);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void addCacheReport(long latency) {
|
public void addCacheReport(long latency) {
|
||||||
cacheReports.add(latency);
|
cacheReports.add(latency);
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,7 @@
|
|||||||
import static org.junit.Assert.assertThat;
|
import static org.junit.Assert.assertThat;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
import static org.mockito.Mockito.*;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -71,6 +72,7 @@
|
|||||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider;
|
import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider;
|
||||||
import org.apache.hadoop.http.HttpConfig;
|
import org.apache.hadoop.http.HttpConfig;
|
||||||
@ -1108,4 +1110,18 @@ public void testErrorMessageForInvalidNameservice() throws Exception {
|
|||||||
LambdaTestUtils.intercept(IOException.class, expectedErrorMessage,
|
LambdaTestUtils.intercept(IOException.class, expectedErrorMessage,
|
||||||
()->DFSUtil.getNNServiceRpcAddressesForCluster(conf));
|
()->DFSUtil.getNNServiceRpcAddressesForCluster(conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAddTransferRateMetricForValidValues() {
|
||||||
|
DataNodeMetrics mockMetrics = mock(DataNodeMetrics.class);
|
||||||
|
DFSUtil.addTransferRateMetric(mockMetrics, 100, 10);
|
||||||
|
verify(mockMetrics).addReadTransferRate(10000);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAddTransferRateMetricForInvalidValue() {
|
||||||
|
DataNodeMetrics mockMetrics = mock(DataNodeMetrics.class);
|
||||||
|
DFSUtil.addTransferRateMetric(mockMetrics, 100, 0);
|
||||||
|
verify(mockMetrics, times(0)).addReadTransferRate(anyLong());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -380,6 +380,7 @@ public void testTimeoutMetric() throws Exception {
|
|||||||
@Test(timeout=120000)
|
@Test(timeout=120000)
|
||||||
public void testDataNodeTimeSpend() throws Exception {
|
public void testDataNodeTimeSpend() throws Exception {
|
||||||
Configuration conf = new HdfsConfiguration();
|
Configuration conf = new HdfsConfiguration();
|
||||||
|
conf.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, "" + 60);
|
||||||
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
|
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
|
||||||
try {
|
try {
|
||||||
final FileSystem fs = cluster.getFileSystem();
|
final FileSystem fs = cluster.getFileSystem();
|
||||||
@ -391,6 +392,7 @@ public void testDataNodeTimeSpend() throws Exception {
|
|||||||
|
|
||||||
final long startWriteValue = getLongCounter("TotalWriteTime", rb);
|
final long startWriteValue = getLongCounter("TotalWriteTime", rb);
|
||||||
final long startReadValue = getLongCounter("TotalReadTime", rb);
|
final long startReadValue = getLongCounter("TotalReadTime", rb);
|
||||||
|
assertCounter("ReadTransferRateNumOps", 0L, rb);
|
||||||
final AtomicInteger x = new AtomicInteger(0);
|
final AtomicInteger x = new AtomicInteger(0);
|
||||||
|
|
||||||
// Lets Metric system update latest metrics
|
// Lets Metric system update latest metrics
|
||||||
@ -410,6 +412,8 @@ public Boolean get() {
|
|||||||
MetricsRecordBuilder rbNew = getMetrics(datanode.getMetrics().name());
|
MetricsRecordBuilder rbNew = getMetrics(datanode.getMetrics().name());
|
||||||
final long endWriteValue = getLongCounter("TotalWriteTime", rbNew);
|
final long endWriteValue = getLongCounter("TotalWriteTime", rbNew);
|
||||||
final long endReadValue = getLongCounter("TotalReadTime", rbNew);
|
final long endReadValue = getLongCounter("TotalReadTime", rbNew);
|
||||||
|
assertCounter("ReadTransferRateNumOps", 1L, rbNew);
|
||||||
|
assertQuantileGauges("ReadTransferRate" + "60s", rbNew, "Rate");
|
||||||
return endWriteValue > startWriteValue
|
return endWriteValue > startWriteValue
|
||||||
&& endReadValue > startReadValue;
|
&& endReadValue > startReadValue;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user