HDFS-16402. Improve HeartbeatManager logic to avoid incorrect stats. (#3839). Contributed by tomscut.
Reviewed-by: He Xiaoqiao <hexiaoqiao@apache.org>
This commit is contained in:
parent
dae33cf935
commit
15b820c83c
@ -2691,7 +2691,7 @@ public long getProvidedCapacity() {
|
|||||||
void updateHeartbeat(DatanodeDescriptor node, StorageReport[] reports,
|
void updateHeartbeat(DatanodeDescriptor node, StorageReport[] reports,
|
||||||
long cacheCapacity, long cacheUsed, int xceiverCount, int failedVolumes,
|
long cacheCapacity, long cacheUsed, int xceiverCount, int failedVolumes,
|
||||||
VolumeFailureSummary volumeFailureSummary) {
|
VolumeFailureSummary volumeFailureSummary) {
|
||||||
|
BlockManagerFaultInjector.getInstance().mockAnException();
|
||||||
for (StorageReport report: reports) {
|
for (StorageReport report: reports) {
|
||||||
providedStorageMap.updateStorage(node, report.getStorage());
|
providedStorageMap.updateStorage(node, report.getStorage());
|
||||||
}
|
}
|
||||||
@ -2703,6 +2703,7 @@ void updateHeartbeatState(DatanodeDescriptor node,
|
|||||||
StorageReport[] reports, long cacheCapacity, long cacheUsed,
|
StorageReport[] reports, long cacheCapacity, long cacheUsed,
|
||||||
int xceiverCount, int failedVolumes,
|
int xceiverCount, int failedVolumes,
|
||||||
VolumeFailureSummary volumeFailureSummary) {
|
VolumeFailureSummary volumeFailureSummary) {
|
||||||
|
BlockManagerFaultInjector.getInstance().mockAnException();
|
||||||
for (StorageReport report: reports) {
|
for (StorageReport report: reports) {
|
||||||
providedStorageMap.updateStorage(node, report.getStorage());
|
providedStorageMap.updateStorage(node, report.getStorage());
|
||||||
}
|
}
|
||||||
|
@ -49,4 +49,8 @@ public void requestBlockReportLease(DatanodeDescriptor node, long leaseId) {
|
|||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public void removeBlockReportLease(DatanodeDescriptor node, long leaseId) {
|
public void removeBlockReportLease(DatanodeDescriptor node, long leaseId) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public void mockAnException() {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -256,9 +256,12 @@ synchronized void updateHeartbeat(final DatanodeDescriptor node,
|
|||||||
int xceiverCount, int failedVolumes,
|
int xceiverCount, int failedVolumes,
|
||||||
VolumeFailureSummary volumeFailureSummary) {
|
VolumeFailureSummary volumeFailureSummary) {
|
||||||
stats.subtract(node);
|
stats.subtract(node);
|
||||||
blockManager.updateHeartbeat(node, reports, cacheCapacity, cacheUsed,
|
try {
|
||||||
xceiverCount, failedVolumes, volumeFailureSummary);
|
blockManager.updateHeartbeat(node, reports, cacheCapacity, cacheUsed,
|
||||||
stats.add(node);
|
xceiverCount, failedVolumes, volumeFailureSummary);
|
||||||
|
} finally {
|
||||||
|
stats.add(node);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
synchronized void updateLifeline(final DatanodeDescriptor node,
|
synchronized void updateLifeline(final DatanodeDescriptor node,
|
||||||
@ -266,13 +269,16 @@ synchronized void updateLifeline(final DatanodeDescriptor node,
|
|||||||
int xceiverCount, int failedVolumes,
|
int xceiverCount, int failedVolumes,
|
||||||
VolumeFailureSummary volumeFailureSummary) {
|
VolumeFailureSummary volumeFailureSummary) {
|
||||||
stats.subtract(node);
|
stats.subtract(node);
|
||||||
// This intentionally calls updateHeartbeatState instead of
|
try {
|
||||||
// updateHeartbeat, because we don't want to modify the
|
// This intentionally calls updateHeartbeatState instead of
|
||||||
// heartbeatedSinceRegistration flag. Arrival of a lifeline message does
|
// updateHeartbeat, because we don't want to modify the
|
||||||
// not count as arrival of the first heartbeat.
|
// heartbeatedSinceRegistration flag. Arrival of a lifeline message does
|
||||||
blockManager.updateHeartbeatState(node, reports, cacheCapacity, cacheUsed,
|
// not count as arrival of the first heartbeat.
|
||||||
xceiverCount, failedVolumes, volumeFailureSummary);
|
blockManager.updateHeartbeatState(node, reports, cacheCapacity, cacheUsed,
|
||||||
stats.add(node);
|
xceiverCount, failedVolumes, volumeFailureSummary);
|
||||||
|
} finally {
|
||||||
|
stats.add(node);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
synchronized void startDecommission(final DatanodeDescriptor node) {
|
synchronized void startDecommission(final DatanodeDescriptor node) {
|
||||||
|
@ -3943,4 +3943,8 @@ boolean isSlownodeByBlockPoolId(String bpId) {
|
|||||||
boolean isSlownode() {
|
boolean isSlownode() {
|
||||||
return blockPoolManager.isSlownode();
|
return blockPoolManager.isSlownode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BlockPoolManager getBlockPoolManager() {
|
||||||
|
return blockPoolManager;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -25,6 +25,9 @@
|
|||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY;
|
||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerFaultInjector;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
|
||||||
import org.apache.hadoop.hdfs.server.protocol.SlowDiskReports;
|
import org.apache.hadoop.hdfs.server.protocol.SlowDiskReports;
|
||||||
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
|
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
|
||||||
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
|
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
|
||||||
@ -95,6 +98,12 @@ public class TestDataNodeLifeline {
|
|||||||
private FSNamesystem namesystem;
|
private FSNamesystem namesystem;
|
||||||
private DataNode dn;
|
private DataNode dn;
|
||||||
private BPServiceActor bpsa;
|
private BPServiceActor bpsa;
|
||||||
|
private final BlockManagerFaultInjector injector = new BlockManagerFaultInjector() {
|
||||||
|
@Override
|
||||||
|
public void mockAnException() {
|
||||||
|
throw new UnknownError("Unknown exception");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws Exception {
|
public void setup() throws Exception {
|
||||||
@ -336,4 +345,49 @@ public T answer(InvocationOnMock invocation)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Mock an exception in HeartbeatManager#updateHeartbeat and HeartbeatManager#updateLifeline
|
||||||
|
* respectively, and trigger the heartbeat and lifeline in sequence. The capacityTotal obtained
|
||||||
|
* before and after this operation should be the same.
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testHeartbeatAndLifelineOnError() throws Exception {
|
||||||
|
final Configuration config = new HdfsConfiguration();
|
||||||
|
config.set(DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY, "0.0.0.0:0");
|
||||||
|
|
||||||
|
try(MiniDFSCluster cluster =
|
||||||
|
new MiniDFSCluster.Builder(config).numDataNodes(1).build()) {
|
||||||
|
cluster.waitActive();
|
||||||
|
final FSNamesystem fsNamesystem = cluster.getNamesystem();
|
||||||
|
|
||||||
|
// Get capacityTotal before triggering heartbeat and lifeline.
|
||||||
|
DatanodeStatistics datanodeStatistics =
|
||||||
|
fsNamesystem.getBlockManager().getDatanodeManager().getDatanodeStatistics();
|
||||||
|
long capacityTotalBefore = datanodeStatistics.getCapacityTotal();
|
||||||
|
|
||||||
|
// Mock an exception in HeartbeatManager#updateHeartbeat and HeartbeatManager#updateLifeline.
|
||||||
|
BlockManagerFaultInjector.instance = injector;
|
||||||
|
DataNode dataNode = cluster.getDataNodes().get(0);
|
||||||
|
BlockPoolManager blockPoolManager = dataNode.getBlockPoolManager();
|
||||||
|
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
|
||||||
|
if (bpos != null) {
|
||||||
|
for (BPServiceActor actor : bpos.getBPServiceActors()) {
|
||||||
|
try {
|
||||||
|
actor.triggerHeartbeatForTests();
|
||||||
|
actor.sendLifelineForTests();
|
||||||
|
} catch (Throwable e) {
|
||||||
|
assertTrue(e.getMessage().contains("Unknown exception"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get capacityTotal after triggering heartbeat and lifeline.
|
||||||
|
long capacityTotalAfter = datanodeStatistics.getCapacityTotal();
|
||||||
|
// The capacityTotal should be same.
|
||||||
|
assertEquals(capacityTotalBefore, capacityTotalAfter);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user