From be488b6070a124234c77f16193ee925d32ca9a20 Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Wed, 3 Apr 2019 11:00:12 -0700 Subject: [PATCH] HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over to standby. Contributed by yunjiong zhao and Wei-Chiu Chuang. --- .../server/blockmanagement/BlockManager.java | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 4d3f1d1a51..4e351c021b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -4235,21 +4235,41 @@ void processExtraRedundancyBlocksOnInService( if (!isPopulatingReplQueues()) { return; } - final Iterator it = srcNode.getBlockIterator(); + int numExtraRedundancy = 0; - while(it.hasNext()) { - final BlockInfo block = it.next(); - if (block.isDeleted()) { - //Orphan block, will be handled eventually, skip + for (DatanodeStorageInfo datanodeStorageInfo : srcNode.getStorageInfos()) { + // the namesystem lock is released between iterations. Make sure the + // storage is not removed before continuing. + if (srcNode.getStorageInfo(datanodeStorageInfo.getStorageID()) == null) { continue; } - int expectedReplication = this.getExpectedRedundancyNum(block); - NumberReplicas num = countNodes(block); - if (shouldProcessExtraRedundancy(num, expectedReplication)) { - // extra redundancy block - processExtraRedundancyBlock(block, (short) expectedReplication, null, - null); - numExtraRedundancy++; + final Iterator it = datanodeStorageInfo.getBlockIterator(); + while(it.hasNext()) { + final BlockInfo block = it.next(); + if (block.isDeleted()) { + //Orphan block, will be handled eventually, skip + continue; + } + int expectedReplication = this.getExpectedRedundancyNum(block); + NumberReplicas num = countNodes(block); + if (shouldProcessExtraRedundancy(num, expectedReplication)) { + // extra redundancy block + processExtraRedundancyBlock(block, (short) expectedReplication, null, + null); + numExtraRedundancy++; + } + } + // When called by tests like TestDefaultBlockPlacementPolicy. + // testPlacementWithLocalRackNodesDecommissioned, it is not protected by + // lock, only when called by DatanodeManager.refreshNodes have writeLock + if (namesystem.hasWriteLock()) { + namesystem.writeUnlock(); + try { + Thread.sleep(1); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + namesystem.writeLock(); } } LOG.info("Invalidated {} extra redundancy blocks on {} after "