HDFS-16315. Add metrics related to Transfer and NativeCopy for DataNode (#3666)

litao 2021-11-17 10:06:53 +08:00 committed by GitHub
parent d0780e0601
commit 026d5860cb
5 changed files with 132 additions and 1 deletion

hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

@@ -492,6 +492,12 @@ contains tags such as Hostname as additional information along with metrics.
 | `WriteIoRateNumOps` | The number of file write io operations within an interval time of metric |
 | `WriteIoRateAvgTime` | Mean time of file write io operations in milliseconds |
 | `WriteIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file write io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
+| `TransferIoRateNumOps` | The number of file transfer io operations within an interval time of metric |
+| `TransferIoRateAvgTime` | Mean time of file transfer io operations in milliseconds |
+| `TransferIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file transfer io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
+| `NativeCopyIoRateNumOps` | The number of file nativeCopy io operations within an interval time of metric |
+| `NativeCopyIoRateAvgTime` | Mean time of file nativeCopy io operations in milliseconds |
+| `NativeCopyIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file nativeCopy io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
 | `TotalFileIoErrors` | Total number (monotonically increasing) of file io error operations |
 | `FileIoErrorRateNumOps` | The number of file io error operations within an interval time of metric |
 | `FileIoErrorRateAvgTime` | It measures the mean time in milliseconds from the start of an operation to hitting a failure |
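
Like the existing read/write latency quantiles, the two new percentile families are off by default and only publish once `dfs.metrics.percentiles.intervals` is set. A minimal sketch of wiring the two relevant keys (key names are the ones used by the test further below; 100% sampling is only sensible in tests):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class VolumeMetricsConfigSketch {
  // Build a DataNode configuration that actually emits the new
  // TransferIo*/NativeCopyIo* metrics: file-io profiling must be sampled
  // and at least one percentile interval must be configured.
  public static Configuration create() {
    Configuration conf = new HdfsConfiguration();
    // Profile every file io event (use a small percentage in production).
    conf.setInt(
        DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
        100);
    // Publish 50/75/90/95/99th percentiles over 60s and 300s windows.
    conf.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, "60,300");
    return conf;
  }
}
```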

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java

@@ -116,6 +116,12 @@ public void afterFileIo(@Nullable FsVolumeSpi volume,
       case WRITE:
         metrics.addWriteIoLatency(latency);
         break;
+      case TRANSFER:
+        metrics.addTransferIoLatency(latency);
+        break;
+      case NATIVE_COPY:
+        metrics.addNativeCopyIoLatency(latency);
+        break;
       default:
       }
     }
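
The switch runs over `FileIoProvider.OPERATION` values, so transfer and native-copy ios now land in the per-volume trackers the same way reads and writes do. A self-contained sketch of that dispatch; the `record` helper is hypothetical, and only the two `add*IoLatency` calls come from this patch:

```java
import org.apache.hadoop.hdfs.server.datanode.FileIoProvider.OPERATION;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;

public class LatencyDispatchSketch {
  // Hypothetical helper mirroring afterFileIo(): route one measured latency
  // (milliseconds) to the matching rate counter and quantile estimators.
  static void record(DataNodeVolumeMetrics metrics, OPERATION op,
      long latencyMs) {
    switch (op) {
    case TRANSFER:
      metrics.addTransferIoLatency(latencyMs);
      break;
    case NATIVE_COPY:
      metrics.addNativeCopyIoLatency(latencyMs);
      break;
    default:
      // READ, WRITE, SYNC, ... are covered by the pre-existing cases.
    }
  }
}
```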

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/DataNodeVolumeMetrics.java

@@ -71,6 +71,14 @@ public class DataNodeVolumeMetrics {
   private MutableRate writeIoRate;
   private MutableQuantiles[] writeIoLatencyQuantiles;
+  @Metric("file io transfer rate")
+  private MutableRate transferIoRate;
+  private MutableQuantiles[] transferIoLatencyQuantiles;
+  @Metric("file io nativeCopy rate")
+  private MutableRate nativeCopyIoRate;
+  private MutableQuantiles[] nativeCopyIoLatencyQuantiles;
   @Metric("number of file io errors")
   private MutableCounterLong totalFileIoErrors;
   @Metric("file io error rate")
@@ -162,6 +170,40 @@ public double getWriteIoStdDev() {
     return writeIoRate.lastStat().stddev();
   }
 
+  // Based on transferIoRate
+  public long getTransferIoSampleCount() {
+    return transferIoRate.lastStat().numSamples();
+  }
+
+  public double getTransferIoMean() {
+    return transferIoRate.lastStat().mean();
+  }
+
+  public double getTransferIoStdDev() {
+    return transferIoRate.lastStat().stddev();
+  }
+
+  public MutableQuantiles[] getTransferIoQuantiles() {
+    return transferIoLatencyQuantiles;
+  }
+
+  // Based on nativeCopyIoRate
+  public long getNativeCopyIoSampleCount() {
+    return nativeCopyIoRate.lastStat().numSamples();
+  }
+
+  public double getNativeCopyIoMean() {
+    return nativeCopyIoRate.lastStat().mean();
+  }
+
+  public double getNativeCopyIoStdDev() {
+    return nativeCopyIoRate.lastStat().stddev();
+  }
+
+  public MutableQuantiles[] getNativeCopyIoQuantiles() {
+    return nativeCopyIoLatencyQuantiles;
+  }
+
   public long getTotalFileIoErrors() {
     return totalFileIoErrors.value();
   }
@@ -193,6 +235,8 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
     syncIoLatencyQuantiles = new MutableQuantiles[len];
     readIoLatencyQuantiles = new MutableQuantiles[len];
     writeIoLatencyQuantiles = new MutableQuantiles[len];
+    transferIoLatencyQuantiles = new MutableQuantiles[len];
+    nativeCopyIoLatencyQuantiles = new MutableQuantiles[len];
     for (int i = 0; i < len; i++) {
       int interval = intervals[i];
       metadataOperationLatencyQuantiles[i] = registry.newQuantiles(
@@ -213,6 +257,12 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
       writeIoLatencyQuantiles[i] = registry.newQuantiles(
           "writeIoLatency" + interval + "s",
           "Data write Io Latency in ms", "ops", "latency", interval);
+      transferIoLatencyQuantiles[i] = registry.newQuantiles(
+          "transferIoLatency" + interval + "s",
+          "Data transfer Io Latency in ms", "ops", "latency", interval);
+      nativeCopyIoLatencyQuantiles[i] = registry.newQuantiles(
+          "nativeCopyIoLatency" + interval + "s",
+          "Data nativeCopy Io Latency in ms", "ops", "latency", interval);
     }
   }
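
Note the registry names: each `registry.newQuantiles("transferIoLatency" + interval + "s", ...)` call above is what surfaces as the `TransferIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` gauges documented in Metrics.md (likewise for nativeCopy), one quantile set per configured interval.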
@@ -282,6 +332,20 @@ public void addWriteIoLatency(final long latency) {
     }
   }
 
+  public void addTransferIoLatency(final long latency) {
+    transferIoRate.add(latency);
+    for (MutableQuantiles q: transferIoLatencyQuantiles) {
+      q.add(latency);
+    }
+  }
+
+  public void addNativeCopyIoLatency(final long latency) {
+    nativeCopyIoRate.add(latency);
+    for (MutableQuantiles q: nativeCopyIoLatencyQuantiles) {
+      q.add(latency);
+    }
+  }
+
   public void addFileIoError(final long latency) {
     totalFileIoErrors.incr();
     fileIoErrorRate.add(latency);
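
Downstream code reads these aggregates through the volume handle, exactly as the new test does. A brief sketch (the `logVolumeStats` wrapper is hypothetical; the getters are the ones added above, and each value reflects the last `MutableRate` interval):

```java
import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;

public class VolumeStatsSketch {
  // Hypothetical consumer: dump the new transfer/nativeCopy aggregates
  // for one volume.
  static void logVolumeStats(FsVolumeSpi volume) {
    DataNodeVolumeMetrics metrics = volume.getMetrics();
    System.out.println("transfers=" + metrics.getTransferIoSampleCount()
        + " meanMs=" + metrics.getTransferIoMean()
        + " stddev=" + metrics.getTransferIoStdDev());
    System.out.println("nativeCopies=" + metrics.getNativeCopyIoSampleCount()
        + " meanMs=" + metrics.getNativeCopyIoMean()
        + " stddev=" + metrics.getNativeCopyIoStdDev());
  }
}
```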

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeMetrics.java

@@ -151,7 +151,7 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
     LOG.info("MetadataOperationSampleCount : " +
         metrics.getMetadataOperationSampleCount());
     LOG.info("MetadataOperationMean : " + metrics.getMetadataOperationMean());
-    LOG.info("MetadataFileIoStdDev : " +
+    LOG.info("MetadataOperationStdDev : " +
         metrics.getMetadataOperationStdDev());
     LOG.info("DataFileIoSampleCount : " + metrics.getDataFileIoSampleCount());
@@ -174,6 +174,15 @@
     LOG.info("writeIoMean : " + metrics.getWriteIoMean());
     LOG.info("writeIoStdDev : " + metrics.getWriteIoStdDev());
+    LOG.info("transferIoSampleCount : " + metrics.getTransferIoSampleCount());
+    LOG.info("transferIoMean : " + metrics.getTransferIoMean());
+    LOG.info("transferIoStdDev : " + metrics.getTransferIoStdDev());
+    LOG.info("nativeCopyIoSampleCount : " +
+        metrics.getNativeCopyIoSampleCount());
+    LOG.info("nativeCopyIoMean : " + metrics.getNativeCopyIoMean());
+    LOG.info("nativeCopyIoStdDev : " + metrics.getNativeCopyIoStdDev());
+
     LOG.info("fileIoErrorSampleCount : "
         + metrics.getFileIoErrorSampleCount());
     LOG.info("fileIoErrorMean : " + metrics.getFileIoErrorMean());

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

@@ -61,6 +61,7 @@
 import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
 import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
 import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi.FsVolumeReferences;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
@@ -1257,4 +1258,49 @@ public void testReleaseVolumeRefIfExceptionThrown() throws IOException {
       cluster.shutdown();
     }
   }
+
+  @Test(timeout = 30000)
+  public void testTransferAndNativeCopyMetrics() throws IOException {
+    Configuration config = new HdfsConfiguration();
+    config.setInt(
+        DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
+        100);
+    config.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
+        "60,300,1500");
+    try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(config)
+        .numDataNodes(1)
+        .storageTypes(new StorageType[]{StorageType.DISK, StorageType.DISK})
+        .storagesPerDatanode(2)
+        .build()) {
+      FileSystem fs = cluster.getFileSystem();
+      DataNode dataNode = cluster.getDataNodes().get(0);
+
+      // Create file that has one block with one replica.
+      Path filePath = new Path("test");
+      DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
+      ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
+
+      // Copy a new replica to other volume.
+      FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
+      ReplicaInfo newReplicaInfo = createNewReplicaObj(block, fsDataSetImpl);
+      fsDataSetImpl.finalizeNewReplica(newReplicaInfo, block);
+
+      // Get the volume where the original replica resides.
+      FsVolumeSpi volume = null;
+      for (FsVolumeSpi fsVolumeReference :
+          fsDataSetImpl.getFsVolumeReferences()) {
+        if (!fsVolumeReference.getStorageID()
+            .equals(newReplicaInfo.getStorageUuid())) {
+          volume = fsVolumeReference;
+        }
+      }
+
+      // Assert metrics.
+      DataNodeVolumeMetrics metrics = volume.getMetrics();
+      assertEquals(2, metrics.getTransferIoSampleCount());
+      assertEquals(3, metrics.getTransferIoQuantiles().length);
+      assertEquals(2, metrics.getNativeCopyIoSampleCount());
+      assertEquals(3, metrics.getNativeCopyIoQuantiles().length);
+    }
+  }
 }
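
A note on the expected values in the assertions: the quantile-array lengths of 3 follow directly from the three percentile intervals configured above ("60,300,1500"), while the sample counts of 2 line up with the two on-disk files a replica copy touches (the block file plus its checksum meta file).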