HDFS-16315. Add metrics related to Transfer and NativeCopy for DataNode (#3666)
This commit is contained in:
parent
d0780e0601
commit
026d5860cb
@ -492,6 +492,12 @@ contains tags such as Hostname as additional information along with metrics.
|
||||
| `WriteIoRateNumOps` | The number of file write I/O operations within a metrics interval |
|
||||
| `WriteIoRateAvgTime` | Mean time of file write io operations in milliseconds |
|
||||
| `WriteIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile latency of file write I/O operations in milliseconds (*num* seconds granularity). Percentile measurement is off by default because no intervals are configured. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
|
||||
| `TransferIoRateNumOps` | The number of file transfer I/O operations within a metrics interval |
|
||||
| `TransferIoRateAvgTime` | Mean time of file transfer io operations in milliseconds |
|
||||
| `TransferIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile latency of file transfer I/O operations in milliseconds (*num* seconds granularity). Percentile measurement is off by default because no intervals are configured. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
|
||||
| `NativeCopyIoRateNumOps` | The number of file nativeCopy I/O operations within a metrics interval |
|
||||
| `NativeCopyIoRateAvgTime` | Mean time of file nativeCopy io operations in milliseconds |
|
||||
| `NativeCopyIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile latency of file nativeCopy I/O operations in milliseconds (*num* seconds granularity). Percentile measurement is off by default because no intervals are configured. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
|
||||
| `TotalFileIoErrors` | Total number (monotonically increasing) of file io error operations |
|
||||
| `FileIoErrorRateNumOps` | The number of file I/O error operations within a metrics interval |
|
||||
| `FileIoErrorRateAvgTime` | It measures the mean time in milliseconds from the start of an operation to hitting a failure |
|
||||
|
@ -116,6 +116,12 @@ public void afterFileIo(@Nullable FsVolumeSpi volume,
|
||||
case WRITE:
|
||||
metrics.addWriteIoLatency(latency);
|
||||
break;
|
||||
case TRANSFER:
|
||||
metrics.addTransferIoLatency(latency);
|
||||
break;
|
||||
case NATIVE_COPY:
|
||||
metrics.addNativeCopyIoLatency(latency);
|
||||
break;
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
@ -71,6 +71,14 @@ public class DataNodeVolumeMetrics {
|
||||
private MutableRate writeIoRate;
|
||||
private MutableQuantiles[] writeIoLatencyQuantiles;
|
||||
|
||||
@Metric("file io transfer rate")
|
||||
private MutableRate transferIoRate;
|
||||
private MutableQuantiles[] transferIoLatencyQuantiles;
|
||||
|
||||
@Metric("file io nativeCopy rate")
|
||||
private MutableRate nativeCopyIoRate;
|
||||
private MutableQuantiles[] nativeCopyIoLatencyQuantiles;
|
||||
|
||||
@Metric("number of file io errors")
|
||||
private MutableCounterLong totalFileIoErrors;
|
||||
@Metric("file io error rate")
|
||||
@ -162,6 +170,40 @@ public double getWriteIoStdDev() {
|
||||
return writeIoRate.lastStat().stddev();
|
||||
}
|
||||
|
||||
// Based on transferIoRate
|
||||
public long getTransferIoSampleCount() {
|
||||
return transferIoRate.lastStat().numSamples();
|
||||
}
|
||||
|
||||
public double getTransferIoMean() {
|
||||
return transferIoRate.lastStat().mean();
|
||||
}
|
||||
|
||||
public double getTransferIoStdDev() {
|
||||
return transferIoRate.lastStat().stddev();
|
||||
}
|
||||
|
||||
public MutableQuantiles[] getTransferIoQuantiles() {
|
||||
return transferIoLatencyQuantiles;
|
||||
}
|
||||
|
||||
// Based on nativeCopyIoRate
|
||||
public long getNativeCopyIoSampleCount() {
|
||||
return nativeCopyIoRate.lastStat().numSamples();
|
||||
}
|
||||
|
||||
public double getNativeCopyIoMean() {
|
||||
return nativeCopyIoRate.lastStat().mean();
|
||||
}
|
||||
|
||||
public double getNativeCopyIoStdDev() {
|
||||
return nativeCopyIoRate.lastStat().stddev();
|
||||
}
|
||||
|
||||
public MutableQuantiles[] getNativeCopyIoQuantiles() {
|
||||
return nativeCopyIoLatencyQuantiles;
|
||||
}
|
||||
|
||||
public long getTotalFileIoErrors() {
|
||||
return totalFileIoErrors.value();
|
||||
}
|
||||
@ -193,6 +235,8 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
|
||||
syncIoLatencyQuantiles = new MutableQuantiles[len];
|
||||
readIoLatencyQuantiles = new MutableQuantiles[len];
|
||||
writeIoLatencyQuantiles = new MutableQuantiles[len];
|
||||
transferIoLatencyQuantiles = new MutableQuantiles[len];
|
||||
nativeCopyIoLatencyQuantiles = new MutableQuantiles[len];
|
||||
for (int i = 0; i < len; i++) {
|
||||
int interval = intervals[i];
|
||||
metadataOperationLatencyQuantiles[i] = registry.newQuantiles(
|
||||
@ -213,6 +257,12 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
|
||||
writeIoLatencyQuantiles[i] = registry.newQuantiles(
|
||||
"writeIoLatency" + interval + "s",
|
||||
"Data write Io Latency in ms", "ops", "latency", interval);
|
||||
transferIoLatencyQuantiles[i] = registry.newQuantiles(
|
||||
"transferIoLatency" + interval + "s",
|
||||
"Data transfer Io Latency in ms", "ops", "latency", interval);
|
||||
nativeCopyIoLatencyQuantiles[i] = registry.newQuantiles(
|
||||
"nativeCopyIoLatency" + interval + "s",
|
||||
"Data nativeCopy Io Latency in ms", "ops", "latency", interval);
|
||||
}
|
||||
}
|
||||
|
||||
@ -282,6 +332,20 @@ public void addWriteIoLatency(final long latency) {
|
||||
}
|
||||
}
|
||||
|
||||
public void addTransferIoLatency(final long latency) {
|
||||
transferIoRate.add(latency);
|
||||
for (MutableQuantiles q: transferIoLatencyQuantiles) {
|
||||
q.add(latency);
|
||||
}
|
||||
}
|
||||
|
||||
public void addNativeCopyIoLatency(final long latency) {
|
||||
nativeCopyIoRate.add(latency);
|
||||
for (MutableQuantiles q: nativeCopyIoLatencyQuantiles) {
|
||||
q.add(latency);
|
||||
}
|
||||
}
|
||||
|
||||
public void addFileIoError(final long latency) {
|
||||
totalFileIoErrors.incr();
|
||||
fileIoErrorRate.add(latency);
|
||||
|
@ -151,7 +151,7 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
|
||||
LOG.info("MetadataOperationSampleCount : " +
|
||||
metrics.getMetadataOperationSampleCount());
|
||||
LOG.info("MetadataOperationMean : " + metrics.getMetadataOperationMean());
|
||||
LOG.info("MetadataFileIoStdDev : " +
|
||||
LOG.info("MetadataOperationStdDev : " +
|
||||
metrics.getMetadataOperationStdDev());
|
||||
|
||||
LOG.info("DataFileIoSampleCount : " + metrics.getDataFileIoSampleCount());
|
||||
@ -174,6 +174,15 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
|
||||
LOG.info("writeIoMean : " + metrics.getWriteIoMean());
|
||||
LOG.info("writeIoStdDev : " + metrics.getWriteIoStdDev());
|
||||
|
||||
LOG.info("transferIoSampleCount : " + metrics.getTransferIoSampleCount());
|
||||
LOG.info("transferIoMean : " + metrics.getTransferIoMean());
|
||||
LOG.info("transferIoStdDev : " + metrics.getTransferIoStdDev());
|
||||
|
||||
LOG.info("nativeCopyIoSampleCount : " +
|
||||
metrics.getNativeCopyIoSampleCount());
|
||||
LOG.info("nativeCopyIoMean : " + metrics.getNativeCopyIoMean());
|
||||
LOG.info("nativeCopyIoStdDev : " + metrics.getNativeCopyIoStdDev());
|
||||
|
||||
LOG.info("fileIoErrorSampleCount : "
|
||||
+ metrics.getFileIoErrorSampleCount());
|
||||
LOG.info("fileIoErrorMean : " + metrics.getFileIoErrorMean());
|
||||
|
@ -61,6 +61,7 @@
|
||||
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
|
||||
import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
|
||||
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi.FsVolumeReferences;
|
||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
|
||||
@ -1257,4 +1258,49 @@ public void testReleaseVolumeRefIfExceptionThrown() throws IOException {
|
||||
cluster.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@Test(timeout = 30000)
|
||||
public void testTransferAndNativeCopyMetrics() throws IOException {
|
||||
Configuration config = new HdfsConfiguration();
|
||||
config.setInt(
|
||||
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
|
||||
100);
|
||||
config.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
|
||||
"60,300,1500");
|
||||
try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(config)
|
||||
.numDataNodes(1)
|
||||
.storageTypes(new StorageType[]{StorageType.DISK, StorageType.DISK})
|
||||
.storagesPerDatanode(2)
|
||||
.build()) {
|
||||
FileSystem fs = cluster.getFileSystem();
|
||||
DataNode dataNode = cluster.getDataNodes().get(0);
|
||||
|
||||
// Create file that has one block with one replica.
|
||||
Path filePath = new Path("test");
|
||||
DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
|
||||
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
|
||||
|
||||
// Copy a new replica to other volume.
|
||||
FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
|
||||
ReplicaInfo newReplicaInfo = createNewReplicaObj(block, fsDataSetImpl);
|
||||
fsDataSetImpl.finalizeNewReplica(newReplicaInfo, block);
|
||||
|
||||
// Get the volume where the original replica resides.
|
||||
FsVolumeSpi volume = null;
|
||||
for (FsVolumeSpi fsVolumeReference :
|
||||
fsDataSetImpl.getFsVolumeReferences()) {
|
||||
if (!fsVolumeReference.getStorageID()
|
||||
.equals(newReplicaInfo.getStorageUuid())) {
|
||||
volume = fsVolumeReference;
|
||||
}
|
||||
}
|
||||
|
||||
// Assert metrics.
|
||||
DataNodeVolumeMetrics metrics = volume.getMetrics();
|
||||
assertEquals(2, metrics.getTransferIoSampleCount());
|
||||
assertEquals(3, metrics.getTransferIoQuantiles().length);
|
||||
assertEquals(2, metrics.getNativeCopyIoSampleCount());
|
||||
assertEquals(3, metrics.getNativeCopyIoQuantiles().length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user