HDFS-16846. EC: Only EC blocks should be affected by max-streams-hard-limit configuration (#5143)

Signed-off-by: Takanobu Asanuma <tasanuma@apache.org>
This commit is contained in:
caozhiqiang 2022-11-29 09:51:21 +08:00 committed by GitHub
parent 909aeca86c
commit 35c65005d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 107 additions and 50 deletions

View File

@ -197,8 +197,10 @@ public Type getType() {
/** A queue of blocks to be replicated by this datanode */ /** A queue of blocks to be replicated by this datanode */
private final BlockQueue<BlockTargetPair> replicateBlocks = private final BlockQueue<BlockTargetPair> replicateBlocks =
new BlockQueue<>(); new BlockQueue<>();
/** A queue of blocks to be erasure coded by this datanode */ /** A queue of ec blocks to be replicated by this datanode. */
private final BlockQueue<BlockECReconstructionInfo> erasurecodeBlocks = private final BlockQueue<BlockTargetPair> ecBlocksToBeReplicated = new BlockQueue<>();
/** A queue of ec blocks to be erasure coded by this datanode. */
private final BlockQueue<BlockECReconstructionInfo> ecBlocksToBeErasureCoded =
new BlockQueue<>(); new BlockQueue<>();
/** A queue of blocks to be recovered by this datanode */ /** A queue of blocks to be recovered by this datanode */
private final BlockQueue<BlockInfo> recoverBlocks = new BlockQueue<>(); private final BlockQueue<BlockInfo> recoverBlocks = new BlockQueue<>();
@ -358,7 +360,8 @@ public void clearBlockQueues() {
} }
this.recoverBlocks.clear(); this.recoverBlocks.clear();
this.replicateBlocks.clear(); this.replicateBlocks.clear();
this.erasurecodeBlocks.clear(); this.ecBlocksToBeReplicated.clear();
this.ecBlocksToBeErasureCoded.clear();
// pendingCached, cached, and pendingUncached are protected by the // pendingCached, cached, and pendingUncached are protected by the
// FSN lock. // FSN lock.
this.pendingCached.clear(); this.pendingCached.clear();
@ -678,6 +681,15 @@ public void addBlockToBeReplicated(Block block,
replicateBlocks.offer(new BlockTargetPair(block, targets)); replicateBlocks.offer(new BlockTargetPair(block, targets));
} }
/**
* Store ec block to be replicated work.
*/
@VisibleForTesting
public void addECBlockToBeReplicated(Block block, DatanodeStorageInfo[] targets) {
assert (block != null && targets != null && targets.length > 0);
ecBlocksToBeReplicated.offer(new BlockTargetPair(block, targets));
}
/** /**
* Store block erasure coding work. * Store block erasure coding work.
*/ */
@ -687,9 +699,9 @@ void addBlockToBeErasureCoded(ExtendedBlock block,
assert (block != null && sources != null && sources.length > 0); assert (block != null && sources != null && sources.length > 0);
BlockECReconstructionInfo task = new BlockECReconstructionInfo(block, BlockECReconstructionInfo task = new BlockECReconstructionInfo(block,
sources, targets, liveBlockIndices, excludeReconstrutedIndices, ecPolicy); sources, targets, liveBlockIndices, excludeReconstrutedIndices, ecPolicy);
erasurecodeBlocks.offer(task); ecBlocksToBeErasureCoded.offer(task);
BlockManager.LOG.debug("Adding block reconstruction task " + task + "to " BlockManager.LOG.debug("Adding block reconstruction task " + task + "to "
+ getName() + ", current queue size is " + erasurecodeBlocks.size()); + getName() + ", current queue size is " + ecBlocksToBeErasureCoded.size());
} }
/** /**
@ -720,7 +732,8 @@ void addBlocksToBeInvalidated(List<Block> blocklist) {
* The number of work items that are pending to be replicated. * The number of work items that are pending to be replicated.
*/ */
int getNumberOfBlocksToBeReplicated() { int getNumberOfBlocksToBeReplicated() {
return pendingReplicationWithoutTargets + replicateBlocks.size(); return pendingReplicationWithoutTargets + replicateBlocks.size()
+ ecBlocksToBeReplicated.size();
} }
/** /**
@ -728,7 +741,15 @@ int getNumberOfBlocksToBeReplicated() {
*/ */
@VisibleForTesting @VisibleForTesting
public int getNumberOfBlocksToBeErasureCoded() { public int getNumberOfBlocksToBeErasureCoded() {
return erasurecodeBlocks.size(); return ecBlocksToBeErasureCoded.size();
}
/**
* The number of ec work items that are pending to be replicated.
*/
@VisibleForTesting
public int getNumberOfECBlocksToBeReplicated() {
return ecBlocksToBeReplicated.size();
} }
@VisibleForTesting @VisibleForTesting
@ -740,9 +761,13 @@ List<BlockTargetPair> getReplicationCommand(int maxTransfers) {
return replicateBlocks.poll(maxTransfers); return replicateBlocks.poll(maxTransfers);
} }
List<BlockTargetPair> getECReplicatedCommand(int maxTransfers) {
return ecBlocksToBeReplicated.poll(maxTransfers);
}
public List<BlockECReconstructionInfo> getErasureCodeCommand( public List<BlockECReconstructionInfo> getErasureCodeCommand(
int maxTransfers) { int maxTransfers) {
return erasurecodeBlocks.poll(maxTransfers); return ecBlocksToBeErasureCoded.poll(maxTransfers);
} }
public BlockInfo[] getLeaseRecoveryCommand(int maxTransfers) { public BlockInfo[] getLeaseRecoveryCommand(int maxTransfers) {
@ -994,7 +1019,11 @@ public String dumpDatanode() {
if (repl > 0) { if (repl > 0) {
sb.append(" ").append(repl).append(" blocks to be replicated;"); sb.append(" ").append(repl).append(" blocks to be replicated;");
} }
int ec = erasurecodeBlocks.size(); int ecRepl = ecBlocksToBeReplicated.size();
if (ecRepl > 0) {
sb.append(" ").append(ecRepl).append(" ec blocks to be replicated;");
}
int ec = ecBlocksToBeErasureCoded.size();
if(ec > 0) { if(ec > 0) {
sb.append(" ").append(ec).append(" blocks to be erasure coded;"); sb.append(" ").append(ec).append(" blocks to be erasure coded;");
} }

View File

@ -1825,28 +1825,41 @@ public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
// Allocate _approximately_ maxTransfers pending tasks to DataNode. // Allocate _approximately_ maxTransfers pending tasks to DataNode.
// NN chooses pending tasks based on the ratio between the lengths of // NN chooses pending tasks based on the ratio between the lengths of
// replication and erasure-coded block queues. // replication and erasure-coded block queues.
int totalReplicateBlocks = nodeinfo.getNumberOfReplicateBlocks(); int replicationBlocks = nodeinfo.getNumberOfReplicateBlocks();
int totalECBlocks = nodeinfo.getNumberOfBlocksToBeErasureCoded(); int ecBlocksToBeReplicated = nodeinfo.getNumberOfECBlocksToBeReplicated();
int totalBlocks = totalReplicateBlocks + totalECBlocks; int ecBlocksToBeErasureCoded = nodeinfo.getNumberOfBlocksToBeErasureCoded();
int totalBlocks = replicationBlocks + ecBlocksToBeReplicated + ecBlocksToBeErasureCoded;
if (totalBlocks > 0) { if (totalBlocks > 0) {
int maxTransfers; int maxTransfers = blockManager.getMaxReplicationStreams() - xmitsInProgress;
int maxECReplicatedTransfers;
if (nodeinfo.isDecommissionInProgress()) { if (nodeinfo.isDecommissionInProgress()) {
maxTransfers = blockManager.getReplicationStreamsHardLimit() maxECReplicatedTransfers = blockManager.getReplicationStreamsHardLimit()
- xmitsInProgress; - xmitsInProgress;
} else { } else {
maxTransfers = blockManager.getMaxReplicationStreams() maxECReplicatedTransfers = maxTransfers;
- xmitsInProgress;
} }
int numReplicationTasks = (int) Math.ceil( int numReplicationTasks = (int) Math.ceil(
(double) (totalReplicateBlocks * maxTransfers) / totalBlocks); (double) (replicationBlocks * maxTransfers) / totalBlocks);
int numECTasks = (int) Math.ceil( int numEcReplicatedTasks = (int) Math.ceil(
(double) (totalECBlocks * maxTransfers) / totalBlocks); (double) (ecBlocksToBeReplicated * maxECReplicatedTransfers) / totalBlocks);
LOG.debug("Pending replication tasks: {} erasure-coded tasks: {}.", int numECReconstructedTasks = (int) Math.ceil(
numReplicationTasks, numECTasks); (double) (ecBlocksToBeErasureCoded * maxTransfers) / totalBlocks);
LOG.debug("Pending replication tasks: {} ec to be replicated tasks: {} " +
"ec reconstruction tasks: {}.",
numReplicationTasks, numEcReplicatedTasks, numECReconstructedTasks);
// check pending replication tasks // check pending replication tasks
List<BlockTargetPair> pendingList = nodeinfo.getReplicationCommand( List<BlockTargetPair> pendingReplicationList = nodeinfo.getReplicationCommand(
numReplicationTasks); numReplicationTasks);
if (pendingList != null && !pendingList.isEmpty()) { List<BlockTargetPair> pendingECReplicatedList = nodeinfo.getECReplicatedCommand(
numEcReplicatedTasks);
List<BlockTargetPair> pendingList = new ArrayList<BlockTargetPair>();
if(pendingReplicationList != null && !pendingReplicationList.isEmpty()) {
pendingList.addAll(pendingReplicationList);
}
if(pendingECReplicatedList != null && !pendingECReplicatedList.isEmpty()) {
pendingList.addAll(pendingECReplicatedList);
}
if (!pendingList.isEmpty()) {
// If the block is deleted, the block size will become // If the block is deleted, the block size will become
// BlockCommand.NO_ACK (LONG.MAX_VALUE) . This kind of block we don't // BlockCommand.NO_ACK (LONG.MAX_VALUE) . This kind of block we don't
// need // need
@ -1868,7 +1881,7 @@ public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
} }
// check pending erasure coding tasks // check pending erasure coding tasks
List<BlockECReconstructionInfo> pendingECList = nodeinfo List<BlockECReconstructionInfo> pendingECList = nodeinfo
.getErasureCodeCommand(numECTasks); .getErasureCodeCommand(numECReconstructedTasks);
if (pendingECList != null && !pendingECList.isEmpty()) { if (pendingECList != null && !pendingECList.isEmpty()) {
cmds.add(new BlockECReconstructionCommand( cmds.add(new BlockECReconstructionCommand(
DNA_ERASURE_CODING_RECONSTRUCTION, pendingECList)); DNA_ERASURE_CODING_RECONSTRUCTION, pendingECList));

View File

@ -164,7 +164,7 @@ private void createReplicationWork(int sourceIndex,
stripedBlk.getDataBlockNum(), blockIndex); stripedBlk.getDataBlockNum(), blockIndex);
final Block targetBlk = new Block(stripedBlk.getBlockId() + blockIndex, final Block targetBlk = new Block(stripedBlk.getBlockId() + blockIndex,
internBlkLen, stripedBlk.getGenerationStamp()); internBlkLen, stripedBlk.getGenerationStamp());
source.addBlockToBeReplicated(targetBlk, source.addECBlockToBeReplicated(targetBlk,
new DatanodeStorageInfo[] {target}); new DatanodeStorageInfo[] {target});
LOG.debug("Add replication task from source {} to " LOG.debug("Add replication task from source {} to "
+ "target {} for EC block {}", source, target, targetBlk); + "target {} for EC block {}", source, target, targetBlk);

View File

@ -759,7 +759,7 @@ public void testDecommissionWithFailedReplicating() throws Exception {
DatanodeInfo extraDn = getDatanodeOutOfTheBlock(blk); DatanodeInfo extraDn = getDatanodeOutOfTheBlock(blk);
DatanodeDescriptor target = bm.getDatanodeManager() DatanodeDescriptor target = bm.getDatanodeManager()
.getDatanode(extraDn.getDatanodeUuid()); .getDatanode(extraDn.getDatanodeUuid());
dn0.addBlockToBeReplicated(targetBlk, dn0.addECBlockToBeReplicated(targetBlk,
new DatanodeStorageInfo[] {target.getStorageInfos()[0]}); new DatanodeStorageInfo[] {target.getStorageInfos()[0]});
// dn0 replicates in success // dn0 replicates in success
@ -883,7 +883,7 @@ public void testDecommissionWithMissingBlock() throws Exception {
.getDatanode(extraDn.getDatanodeUuid()); .getDatanode(extraDn.getDatanodeUuid());
DatanodeDescriptor dnStartIndexDecommission = bm.getDatanodeManager() DatanodeDescriptor dnStartIndexDecommission = bm.getDatanodeManager()
.getDatanode(dnLocs[decommNodeIndex].getDatanodeUuid()); .getDatanode(dnLocs[decommNodeIndex].getDatanodeUuid());
dnStartIndexDecommission.addBlockToBeReplicated(targetBlk, dnStartIndexDecommission.addECBlockToBeReplicated(targetBlk,
new DatanodeStorageInfo[] {target.getStorageInfos()[0]}); new DatanodeStorageInfo[] {target.getStorageInfos()[0]});
// Wait for replication success. // Wait for replication success.
@ -972,7 +972,7 @@ public void testCountNodes() throws Exception{
DatanodeInfo extraDn = getDatanodeOutOfTheBlock(blk); DatanodeInfo extraDn = getDatanodeOutOfTheBlock(blk);
DatanodeDescriptor target = bm.getDatanodeManager() DatanodeDescriptor target = bm.getDatanodeManager()
.getDatanode(extraDn.getDatanodeUuid()); .getDatanode(extraDn.getDatanodeUuid());
dn0.addBlockToBeReplicated(targetBlk, dn0.addECBlockToBeReplicated(targetBlk,
new DatanodeStorageInfo[] {target.getStorageInfos()[0]}); new DatanodeStorageInfo[] {target.getStorageInfos()[0]});
// dn0 replicates in success // dn0 replicates in success

View File

@ -967,20 +967,22 @@ public void testRemoveIncludedNode() throws IOException {
* Verify the correctness of pending recovery process. * Verify the correctness of pending recovery process.
* *
* @param numReplicationBlocks the number of replication blocks in the queue. * @param numReplicationBlocks the number of replication blocks in the queue.
* @param numECBlocks number of EC blocks in the queue. * @param numEcBlocksToBeReplicated the number of EC blocks to be replicated in the queue.
* @param numBlocksToBeErasureCoded number of EC blocks to be erasure coded in the queue.
* @param maxTransfers the maxTransfer value. * @param maxTransfers the maxTransfer value.
* @param maxTransfersHardLimit the maxTransfer hard limit value. * @param maxTransfersHardLimit the maxTransfer hard limit value.
* @param numReplicationTasks the number of replication tasks polled from * @param numReplicationTasks the number of replication tasks polled from the queue.
* @param numECTasksToBeReplicated the number of EC tasks to be replicated polled from the queue.
* @param numECTasksToBeErasureCoded the number of EC tasks to be erasure coded polled from
* the queue. * the queue.
* @param numECTasks the number of EC tasks polled from the queue.
* @param isDecommissioning if the node is in the decommissioning process. * @param isDecommissioning if the node is in the decommissioning process.
* *
* @throws IOException * @throws IOException
*/ */
private void verifyPendingRecoveryTasks( private void verifyPendingRecoveryTasks(
int numReplicationBlocks, int numECBlocks, int numReplicationBlocks, int numEcBlocksToBeReplicated, int numBlocksToBeErasureCoded,
int maxTransfers, int maxTransfersHardLimit, int maxTransfers, int maxTransfersHardLimit, int numReplicationTasks,
int numReplicationTasks, int numECTasks, boolean isDecommissioning) int numECTasksToBeReplicated, int numECTasksToBeErasureCoded, boolean isDecommissioning)
throws IOException { throws IOException {
FSNamesystem fsn = Mockito.mock(FSNamesystem.class); FSNamesystem fsn = Mockito.mock(FSNamesystem.class);
Mockito.when(fsn.hasWriteLock()).thenReturn(true); Mockito.when(fsn.hasWriteLock()).thenReturn(true);
@ -1009,13 +1011,25 @@ private void verifyPendingRecoveryTasks(
.thenReturn(tasks); .thenReturn(tasks);
} }
if (numECBlocks > 0) { if (numEcBlocksToBeReplicated > 0) {
Mockito.when(nodeInfo.getNumberOfECBlocksToBeReplicated())
.thenReturn(numEcBlocksToBeReplicated);
List<BlockTargetPair> ecReplicatedTasks =
Collections.nCopies(
Math.min(numECTasksToBeReplicated, numEcBlocksToBeReplicated),
new BlockTargetPair(null, null));
Mockito.when(nodeInfo.getECReplicatedCommand(numECTasksToBeReplicated))
.thenReturn(ecReplicatedTasks);
}
if (numBlocksToBeErasureCoded > 0) {
Mockito.when(nodeInfo.getNumberOfBlocksToBeErasureCoded()) Mockito.when(nodeInfo.getNumberOfBlocksToBeErasureCoded())
.thenReturn(numECBlocks); .thenReturn(numBlocksToBeErasureCoded);
List<BlockECReconstructionInfo> tasks = List<BlockECReconstructionInfo> tasks =
Collections.nCopies(numECTasks, null); Collections.nCopies(numECTasksToBeErasureCoded, null);
Mockito.when(nodeInfo.getErasureCodeCommand(numECTasks)) Mockito.when(nodeInfo.getErasureCodeCommand(numECTasksToBeErasureCoded))
.thenReturn(tasks); .thenReturn(tasks);
} }
@ -1026,42 +1040,43 @@ private void verifyPendingRecoveryTasks(
SlowPeerReports.EMPTY_REPORT, SlowDiskReports.EMPTY_REPORT); SlowPeerReports.EMPTY_REPORT, SlowDiskReports.EMPTY_REPORT);
long expectedNumCmds = Arrays.stream( long expectedNumCmds = Arrays.stream(
new int[]{numReplicationTasks, numECTasks}) new int[]{numReplicationTasks + numECTasksToBeReplicated, numECTasksToBeErasureCoded})
.filter(x -> x > 0) .filter(x -> x > 0)
.count(); .count();
assertEquals(expectedNumCmds, cmds.length); assertEquals(expectedNumCmds, cmds.length);
int idx = 0; int idx = 0;
if (numReplicationTasks > 0) { if (numReplicationTasks > 0 || numECTasksToBeReplicated > 0) {
assertTrue(cmds[idx] instanceof BlockCommand); assertTrue(cmds[idx] instanceof BlockCommand);
BlockCommand cmd = (BlockCommand) cmds[0]; BlockCommand cmd = (BlockCommand) cmds[0];
assertEquals(numReplicationTasks, cmd.getBlocks().length); assertEquals(numReplicationTasks + numECTasksToBeReplicated, cmd.getBlocks().length);
assertEquals(numReplicationTasks, cmd.getTargets().length); assertEquals(numReplicationTasks + numECTasksToBeReplicated, cmd.getTargets().length);
idx++; idx++;
} }
if (numECTasks > 0) { if (numECTasksToBeErasureCoded > 0) {
assertTrue(cmds[idx] instanceof BlockECReconstructionCommand); assertTrue(cmds[idx] instanceof BlockECReconstructionCommand);
BlockECReconstructionCommand cmd = BlockECReconstructionCommand cmd =
(BlockECReconstructionCommand) cmds[idx]; (BlockECReconstructionCommand) cmds[idx];
assertEquals(numECTasks, cmd.getECTasks().size()); assertEquals(numECTasksToBeErasureCoded, cmd.getECTasks().size());
} }
Mockito.verify(nodeInfo).getReplicationCommand(numReplicationTasks); Mockito.verify(nodeInfo).getReplicationCommand(numReplicationTasks);
Mockito.verify(nodeInfo).getErasureCodeCommand(numECTasks); Mockito.verify(nodeInfo).getECReplicatedCommand(numECTasksToBeReplicated);
Mockito.verify(nodeInfo).getErasureCodeCommand(numECTasksToBeErasureCoded);
} }
@Test @Test
public void testPendingRecoveryTasks() throws IOException { public void testPendingRecoveryTasks() throws IOException {
// Tasks are split according to the ratio between queue lengths. // Tasks are split according to the ratio between queue lengths.
verifyPendingRecoveryTasks(20, 20, 20, 30, 10, 10, false); verifyPendingRecoveryTasks(20, 0, 20, 20, 30, 10, 0, 10, false);
verifyPendingRecoveryTasks(40, 10, 20, 30, 16, 4, false); verifyPendingRecoveryTasks(40, 0, 10, 20, 30, 16, 0, 4, false);
// Approximately load tasks if the ratio between queue length is large. // Approximately load tasks if the ratio between queue length is large.
verifyPendingRecoveryTasks(400, 1, 20, 30, 20, 1, false); verifyPendingRecoveryTasks(400, 0, 1, 20, 30, 20, 0, 1, false);
// Tasks use dfs.namenode.replication.max-streams-hard-limit for decommissioning node // Tasks use dfs.namenode.replication.max-streams-hard-limit for decommissioning node
verifyPendingRecoveryTasks(30, 30, 20, 30, 15, 15, true); verifyPendingRecoveryTasks(20, 10, 10, 20, 40, 10, 10, 5, true);
} }
@Test @Test