HDFS-15158. The number of failed volumes does not match the volumeFailures Datanode metric. Contributed by Yang Yun.

This commit is contained in:
Ayush Saxena 2020-02-09 23:19:40 +05:30
parent 28f730b317
commit 6191d4b4a0
3 changed files with 35 additions and 9 deletions

View File

@ -2215,7 +2215,7 @@ public class DataNode extends ReconfigurableBase
});
}
private void handleDiskError(String failedVolumes) {
private void handleDiskError(String failedVolumes, int failedNumber) {
final boolean hasEnoughResources = data.hasEnoughResource();
LOG.warn("DataNode.handleDiskError on: " +
"[{}] Keep Running: {}", failedVolumes, hasEnoughResources);
@ -2224,7 +2224,7 @@ public class DataNode extends ReconfigurableBase
// shutdown the DN completely.
int dpError = hasEnoughResources ? DatanodeProtocol.DISK_ERROR
: DatanodeProtocol.FATAL_DISK_ERROR;
metrics.incrVolumeFailures();
metrics.incrVolumeFailures(failedNumber);
//inform NameNodes
for(BPOfferService bpos: blockPoolManager.getAllNamenodeThreads()) {
@ -3452,8 +3452,8 @@ public class DataNode extends ReconfigurableBase
}
data.handleVolumeFailures(unhealthyVolumes);
Set<StorageLocation> unhealthyLocations = new HashSet<>(
unhealthyVolumes.size());
int failedNumber = unhealthyVolumes.size();
Set<StorageLocation> unhealthyLocations = new HashSet<>(failedNumber);
StringBuilder sb = new StringBuilder("DataNode failed volumes:");
for (FsVolumeSpi vol : unhealthyVolumes) {
@ -3469,7 +3469,7 @@ public class DataNode extends ReconfigurableBase
}
LOG.debug("{}", sb);
// send blockreport regarding volume failure
handleDiskError(sb.toString());
handleDiskError(sb.toString(), failedNumber);
}
/**

View File

@ -400,8 +400,8 @@ public class DataNodeMetrics {
}
}
public void incrVolumeFailures() {
volumeFailures.incr();
/**
 * Adds {@code size} to the volumeFailures counter in a single update.
 *
 * @param size amount to add to the volumeFailures counter
 */
public void incrVolumeFailures(int size) {
volumeFailures.incr(size);
}
public void incrDatanodeNetworkErrors() {

View File

@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.datanode;
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertEquals;
@ -77,6 +79,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.test.GenericTestUtils;
@ -947,4 +950,27 @@ public class TestDataNodeVolumeFailure {
}
}, 10, 30 * 1000);
}
/**
 * Makes two storage directories of the DataNode at index 1 read-only,
 * triggers a single disk check, and verifies that the "VolumeFailures"
 * counter reports both failures.
 */
@Test
public void testVolumeFailureTwo() throws Exception {
  // Fail storage directories (1, 0) and (1, 1). data_fail and failedDir are
  // declared outside this method and end up pointing at the last directory.
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    data_fail = cluster.getInstanceStorageDir(1, dirIndex);
    failedDir = MiniDFSCluster.getFinalizedDir(data_fail,
        cluster.getNamesystem().getBlockPoolId());
    failedDir.setReadOnly();
  }

  // One disk check must observe both failed directories.
  final DataNode dataNode = cluster.getDataNodes().get(1);
  dataNode.checkDiskError();

  // Both failures must be reflected in the counter.
  final MetricsRecordBuilder metricsRecord =
      getMetrics(dataNode.getMetrics().name());
  assertEquals(2, getLongCounter("VolumeFailures", metricsRecord));
}
}