HDFS-15158. The number of failed volumes does not match the volumeFailures metric of the Datanode. Contributed by Yang Yun.
This commit is contained in:
parent
28f730b317
commit
6191d4b4a0
@ -2215,7 +2215,7 @@ public class DataNode extends ReconfigurableBase
|
||||
});
|
||||
}
|
||||
|
||||
private void handleDiskError(String failedVolumes) {
|
||||
private void handleDiskError(String failedVolumes, int failedNumber) {
|
||||
final boolean hasEnoughResources = data.hasEnoughResource();
|
||||
LOG.warn("DataNode.handleDiskError on: " +
|
||||
"[{}] Keep Running: {}", failedVolumes, hasEnoughResources);
|
||||
@ -2224,7 +2224,7 @@ public class DataNode extends ReconfigurableBase
|
||||
// shutdown the DN completely.
|
||||
int dpError = hasEnoughResources ? DatanodeProtocol.DISK_ERROR
|
||||
: DatanodeProtocol.FATAL_DISK_ERROR;
|
||||
metrics.incrVolumeFailures();
|
||||
metrics.incrVolumeFailures(failedNumber);
|
||||
|
||||
//inform NameNodes
|
||||
for(BPOfferService bpos: blockPoolManager.getAllNamenodeThreads()) {
|
||||
@ -3452,8 +3452,8 @@ public class DataNode extends ReconfigurableBase
|
||||
}
|
||||
|
||||
data.handleVolumeFailures(unhealthyVolumes);
|
||||
Set<StorageLocation> unhealthyLocations = new HashSet<>(
|
||||
unhealthyVolumes.size());
|
||||
int failedNumber = unhealthyVolumes.size();
|
||||
Set<StorageLocation> unhealthyLocations = new HashSet<>(failedNumber);
|
||||
|
||||
StringBuilder sb = new StringBuilder("DataNode failed volumes:");
|
||||
for (FsVolumeSpi vol : unhealthyVolumes) {
|
||||
@ -3468,8 +3468,8 @@ public class DataNode extends ReconfigurableBase
|
||||
LOG.warn("Error occurred when removing unhealthy storage dirs", e);
|
||||
}
|
||||
LOG.debug("{}", sb);
|
||||
// send blockreport regarding volume failure
|
||||
handleDiskError(sb.toString());
|
||||
// send blockreport regarding volume failure
|
||||
handleDiskError(sb.toString(), failedNumber);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -399,9 +399,9 @@ public class DataNodeMetrics {
|
||||
remoteBytesRead.incr(size);
|
||||
}
|
||||
}
|
||||
|
||||
public void incrVolumeFailures() {
|
||||
volumeFailures.incr();
|
||||
|
||||
/**
 * Increments the volumeFailures counter by {@code size} in a single call.
 *
 * @param size amount to add to the counter; the caller visible in this
 *             change passes the number of unhealthy volumes found by one
 *             disk check
 */
public void incrVolumeFailures(int size) {
|
||||
volumeFailures.incr(size);
|
||||
}
|
||||
|
||||
public void incrDatanodeNetworkErrors() {
|
||||
|
@ -17,6 +17,8 @@
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.server.datanode;
|
||||
|
||||
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
|
||||
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
|
||||
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
|
||||
import static org.hamcrest.core.Is.is;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
@ -77,6 +79,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
|
||||
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
@ -947,4 +950,27 @@ public class TestDataNodeVolumeFailure {
|
||||
}
|
||||
}, 10, 30 * 1000);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fail two volumes, and check the metrics of VolumeFailures
* Both storage directories of the DataNode at index 1 are made read-only,
* a disk check is run on that DataNode, and the "VolumeFailures" counter
* is expected to equal 2 (one per failed volume).
|
||||
*/
|
||||
@Test
|
||||
public void testVolumeFailureTwo() throws Exception {
|
||||
// fail two volumes
|
||||
// first storage dir (index 0) of DataNode 1: make its finalized dir read-only
data_fail = cluster.getInstanceStorageDir(1, 0);
|
||||
failedDir = MiniDFSCluster.getFinalizedDir(data_fail,
|
||||
cluster.getNamesystem().getBlockPoolId());
|
||||
failedDir.setReadOnly();
|
||||
// second storage dir (index 1) of the same DataNode: same treatment.
// NOTE(review): data_fail/failedDir are overwritten here, so only the second
// directory stays referenced afterwards - confirm cleanup restores both.
data_fail = cluster.getInstanceStorageDir(1, 1);
|
||||
failedDir = MiniDFSCluster.getFinalizedDir(data_fail,
|
||||
cluster.getNamesystem().getBlockPoolId());
|
||||
failedDir.setReadOnly();
|
||||
|
||||
final DataNode dn = cluster.getDataNodes().get(1);
|
||||
// presumably triggers the volume check that detects both read-only dirs
dn.checkDiskError();
|
||||
|
||||
// read the DataNode's metrics record and verify the failure counter
MetricsRecordBuilder rb = getMetrics(dn.getMetrics().name());
|
||||
long volumeFailures = getLongCounter("VolumeFailures", rb);
|
||||
assertEquals(2, volumeFailures);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user