HDFS-17358. EC: infinite lease recovery caused by the length of an RWR being zero or a datanode not having the replica. (#6509). Contributed by farmmamba.

Reviewed-by: Tao Li <tomscut@apache.org>
Reviewed-by: Haiyang Hu <haiyang.hu@shopee.com>
Signed-off-by: Shuyan Zhang <zhangshuyan@apache.org>
commit 15af52954f (parent a897e745f5)
Author: hfutatzhanghb
Date: 2024-02-27 10:19:57 +08:00 (committed by GitHub)
3 changed files with 66 additions and 11 deletions

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java

@@ -386,6 +386,8 @@ protected void recover() throws IOException {
       Map<Long, BlockRecord> syncBlocks = new HashMap<>(locs.length);
       final int dataBlkNum = ecPolicy.getNumDataUnits();
       final int totalBlkNum = dataBlkNum + ecPolicy.getNumParityUnits();
+      int zeroLenReplicaCnt = 0;
+      int dnNotHaveReplicaCnt = 0;
       //check generation stamps
       for (int i = 0; i < locs.length; i++) {
         DatanodeID id = locs[i];
@@ -419,10 +421,14 @@ protected void recover() throws IOException {
             if (info == null) {
               LOG.debug("Block recovery: DataNode: {} does not have " +
                   "replica for block: (block={}, internalBlk={})", id, block, internalBlk);
+              dnNotHaveReplicaCnt++;
             } else {
               LOG.debug("Block recovery: Ignored replica with invalid "
                   + "generation stamp or length: {} from DataNode: {} by block: {}",
                   info, id, block);
+              if (info.getNumBytes() == 0) {
+                zeroLenReplicaCnt++;
+              }
             }
           }
         } catch (RecoveryInProgressException ripE) {
@@ -436,9 +442,18 @@ protected void recover() throws IOException {
               "datanode={})", block, internalBlk, id, e);
         }
       }
-      checkLocations(syncBlocks.size());
-      final long safeLength = getSafeLength(syncBlocks);
+      final long safeLength;
+      if (dnNotHaveReplicaCnt + zeroLenReplicaCnt <= locs.length - ecPolicy.getNumDataUnits()) {
+        checkLocations(syncBlocks.size());
+        safeLength = getSafeLength(syncBlocks);
+      } else {
+        safeLength = 0;
+        LOG.warn("Block recovery: {} datanodes do not have the replica of block {}." +
+            " {} datanodes have zero-length replica. Will remove this block.",
+            dnNotHaveReplicaCnt, block, zeroLenReplicaCnt);
+      }
       LOG.debug("Recovering block {}, length={}, safeLength={}, syncList={}", block,
           block.getNumBytes(), safeLength, syncBlocks);
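Note: the new condition above is the heart of the fix. A standalone sketch of the arithmetic, assuming an RS-6-3 policy (9 internal blocks, 6 data units); the class and method names below are illustrative, not part of the patch:

// Illustrative only: mirrors the tolerance check added in the hunk above.
public class SafeLengthGateExample {
  static boolean canComputeSafeLength(int totalLocs, int dataUnits,
      int dnNotHaveReplicaCnt, int zeroLenReplicaCnt) {
    // At most (totalLocs - dataUnits) internal blocks may be unusable;
    // beyond that, fewer than dataUnits healthy replicas remain, so a
    // non-zero safe length can never be computed and lease recovery
    // would previously retry forever.
    return dnNotHaveReplicaCnt + zeroLenReplicaCnt <= totalLocs - dataUnits;
  }

  public static void main(String[] args) {
    // RS-6-3: 9 locations, 6 data units -> tolerate up to 3 bad replicas.
    System.out.println(canComputeSafeLength(9, 6, 2, 1)); // true: 3 <= 3
    System.out.println(canComputeSafeLength(9, 6, 2, 2)); // false: 4 > 3 -> safeLength = 0
  }
}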
@@ -452,11 +467,13 @@ protected void recover() throws IOException {
           rurList.add(r);
         }
       }
-      assert rurList.size() >= dataBlkNum : "incorrect safe length";
-      // Recovery the striped block by truncating internal blocks to the safe
-      // length. Abort if there is any failure in this step.
-      truncatePartialBlock(rurList, safeLength);
+      if (safeLength > 0) {
+        Preconditions.checkArgument(rurList.size() >= dataBlkNum, "incorrect safe length");
+        // Recover the striped block by truncating internal blocks to the safe
+        // length. Abort if there is any failure in this step.
+        truncatePartialBlock(rurList, safeLength);
+      }

       // notify Namenode the new size and locations
       final DatanodeID[] newLocs = new DatanodeID[totalBlkNum];
@@ -469,11 +486,20 @@ protected void recover() throws IOException {
         int index = (int) (r.rInfo.getBlockId() &
             HdfsServerConstants.BLOCK_GROUP_INDEX_MASK);
         newLocs[index] = r.id;
-        newStorages[index] = r.storageID;
+        if (r.storageID != null) {
+          newStorages[index] = r.storageID;
+        }
       }

       ExtendedBlock newBlock = new ExtendedBlock(bpid, block.getBlockId(),
           safeLength, recoveryId);
       DatanodeProtocolClientSideTranslatorPB nn = getActiveNamenodeForBP(bpid);
+      if (safeLength == 0) {
+        nn.commitBlockSynchronization(block, newBlock.getGenerationStamp(),
+            newBlock.getNumBytes(), true, true, newLocs, newStorages);
+        LOG.info("After block recovery, the length of new block is 0. " +
+            "Will remove this block: {} from file.", newBlock);
+        return;
+      }
       nn.commitBlockSynchronization(block, newBlock.getGenerationStamp(),
           newBlock.getNumBytes(), true, false, newLocs, newStorages);
     }
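Note: the two commitBlockSynchronization calls above differ only in the deleteblock flag (the fifth argument of the DatanodeProtocol method: block, newGenerationStamp, newLength, closeFile, deleteblock, newTargets, newTargetStorages). A hedged sketch of the decision; the wrapper method below is hypothetical, not code from the patch:

import java.io.IOException;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;

// Hypothetical wrapper illustrating the two recovery outcomes above.
class CommitSyncSketch {
  static void finishRecovery(DatanodeProtocolClientSideTranslatorPB nn,
      ExtendedBlock oldBlock, ExtendedBlock newBlock,
      DatanodeID[] newLocs, String[] newStorages) throws IOException {
    boolean deleteBlock = newBlock.getNumBytes() == 0;
    // closeFile=true either way; deleteblock=true tells the NameNode to
    // drop the zero-length block group from the file instead of keeping
    // it and re-triggering recovery on the next lease check.
    nn.commitBlockSynchronization(oldBlock, newBlock.getGenerationStamp(),
        newBlock.getNumBytes(), true, deleteBlock, newLocs, newStorages);
  }
}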
@@ -527,8 +553,8 @@ long getSafeLength(Map<Long, BlockRecord> syncBlocks) {
     private void checkLocations(int locationCount)
         throws IOException {
       if (locationCount < ecPolicy.getNumDataUnits()) {
-        throw new IOException(block + " has no enough internal blocks" +
-            ", unable to start recovery. Locations=" + Arrays.asList(locs));
+        throw new IOException(block + " does not have enough internal blocks (current: " +
+            locationCount + "), unable to start recovery. Locations=" + Arrays.asList(locs));
       }
     }
   }

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockReader.java

@@ -133,8 +133,8 @@ private BlockReader createBlockReader(long offsetInBlock) {
           block.getNumBytes() - offsetInBlock, true, "", peer, source,
           null, stripedReader.getCachingStrategy(), -1, conf);
     } catch (IOException e) {
-      LOG.info("Exception while creating remote block reader, datanode {}",
-          source, e);
+      LOG.info("Exception while creating remote block reader for {}, datanode {}",
+          block, source, e);
       IOUtils.closeStream(peer);
       return null;
     }

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecoveryStriped.java

@@ -259,6 +259,35 @@ public void testSafeLength() {
     checkSafeLength(1024 * 1024 * 1024, 6442450944L); // Length of: 1 GiB
   }

+  /**
+   * 1. Write 1MB of data, then flush it.
+   * 2. Simulate the client quitting unexpectedly.
+   * 3. Trigger lease recovery.
+   * 4. Verify that lease recovery succeeds.
+   */
+  @Test
+  public void testLeaseRecoveryWithManyZeroLengthReplica() {
+    int curCellSize = 1024 * 1024;
+    try {
+      final FSDataOutputStream out = dfs.create(p);
+      final DFSStripedOutputStream stripedOut = (DFSStripedOutputStream) out
+          .getWrappedStream();
+      for (int pos = 0; pos < curCellSize; pos++) {
+        out.write(StripedFileTestUtil.getByte(pos));
+      }
+      for (int i = 0; i < dataBlocks + parityBlocks; i++) {
+        StripedDataStreamer s = stripedOut.getStripedDataStreamer(i);
+        waitStreamerAllAcked(s);
+        stopBlockStream(s);
+      }
+      recoverLease();
+      LOG.info("Triggered lease recovery manually and it succeeded.");
+    } catch (Throwable e) {
+      String msg = "failed testCase: " + StringUtils.stringifyException(e);
+      Assert.fail(msg);
+    }
+  }
+
   private void checkSafeLength(int blockLength, long expectedSafeLength) {
     int[] blockLengths = new int[]{blockLength, blockLength, blockLength, blockLength,
         blockLength, blockLength};
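Note: the test relies on a recoverLease() helper defined elsewhere in TestLeaseRecoveryStriped. A plausible shape for it, polling DistributedFileSystem#recoverLease, which returns true once lease recovery has completed; this sketch is an assumption, not the committed helper:

// Hypothetical sketch of the polling helper assumed by the test above.
private void recoverLease() throws Exception {
  org.apache.hadoop.test.GenericTestUtils.waitFor(() -> {
    try {
      // Returns true once the lease (and block recovery) has completed.
      return dfs.recoverLease(p);
    } catch (IOException e) {
      return false;
    }
  }, 1000, 60000); // check every second, time out after 60 seconds
}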