HDFS-11446. TestMaintenanceState#testWithNNAndDNRestart fails intermittently. Contributed by Yiqun Lin.

This commit is contained in:
Yiqun Lin 2017-05-28 11:23:32 +08:00
parent 89bb8bfe58
commit 31058b243e

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.hdfs; package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
@ -30,12 +29,7 @@
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream; import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
@ -48,8 +42,16 @@
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
/** /**
* This class tests node maintenance. * This class tests node maintenance.
@ -125,8 +127,8 @@ public void testTakeNodeOutOfEnteringMaintenance() throws Exception {
// When node is in ENTERING_MAINTENANCE state, it can still serve read // When node is in ENTERING_MAINTENANCE state, it can still serve read
// requests // requests
assertNull(checkWithRetry(ns, fileSys, file, replicas, null, checkWithRetry(ns, fileSys, file, replicas, null,
nodeOutofService)); nodeOutofService);
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
@ -387,8 +389,8 @@ private void testExpectedReplication(int replicationFactor,
// The block should be replicated to another datanode to meet // The block should be replicated to another datanode to meet
// expected replication count. // expected replication count.
assertNull(checkWithRetry(ns, fileSys, file, expectedReplicasInRead, checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
nodeOutofService)); nodeOutofService);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
teardown(); teardown();
@ -548,19 +550,19 @@ public void testTransitionToDecommission() throws IOException {
client.datanodeReport(DatanodeReportType.LIVE).length); client.datanodeReport(DatanodeReportType.LIVE).length);
// test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock // test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null, takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null,
AdminStates.DECOMMISSIONED); AdminStates.DECOMMISSIONED);
// test 2 after decommission has completed, the replication count is // test 2 after decommission has completed, the replication count is
// replicas + 1 which includes the decommissioned node. // replicas + 1 which includes the decommissioned node.
assertNull(checkWithRetry(ns, fileSys, file, replicas + 1, null)); checkWithRetry(ns, fileSys, file, replicas + 1, null);
// test 3, put the node in service, replication count should restore. // test 3, put the node in service, replication count should restore.
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null)); checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -587,8 +589,8 @@ public void testTransitionFromDecommissioning() throws IOException {
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE, takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
null, AdminStates.IN_MAINTENANCE); null, AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -631,10 +633,10 @@ private void testDecommissionDifferentNodeAfterMaintenance(int repl)
takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes, takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes,
AdminStates.DECOMMISSIONED); AdminStates.DECOMMISSIONED);
// Out of the replicas returned, one is the decommissioned node. // Out of the replicas returned, one is the decommissioned node.
assertNull(checkWithRetry(ns, fileSys, file, repl, maintenanceDN)); checkWithRetry(ns, fileSys, file, repl, maintenanceDN);
putNodeInService(0, maintenanceDN); putNodeInService(0, maintenanceDN);
assertNull(checkWithRetry(ns, fileSys, file, repl + 1, null)); checkWithRetry(ns, fileSys, file, repl + 1, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
teardown(); teardown();
@ -663,7 +665,7 @@ public void testMultipleNodesMaintenance() throws Exception {
AdminStates.IN_MAINTENANCE); AdminStates.IN_MAINTENANCE);
// Verify file replication matches maintenance state min replication // Verify file replication matches maintenance state min replication
assertNull(checkWithRetry(ns, fileSys, file, 1, null, nodes[0])); checkWithRetry(ns, fileSys, file, 1, null, nodes[0]);
// Put the maintenance nodes back in service // Put the maintenance nodes back in service
for (DatanodeInfo datanodeInfo : maintenanceDN) { for (DatanodeInfo datanodeInfo : maintenanceDN) {
@ -671,7 +673,7 @@ public void testMultipleNodesMaintenance() throws Exception {
} }
// Verify file replication catching up to the old state // Verify file replication catching up to the old state
assertNull(checkWithRetry(ns, fileSys, file, repl, null)); checkWithRetry(ns, fileSys, file, repl, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -720,19 +722,19 @@ private void testChangeReplicationFactor(int oldFactor, int newFactor,
// Verify that the nodeOutofService remains in blocksMap and // Verify that the nodeOutofService remains in blocksMap and
// # of live replicas For read operation is expected. // # of live replicas For read operation is expected.
assertNull(checkWithRetry(ns, fileSys, file, oldFactor - 1, checkWithRetry(ns, fileSys, file, oldFactor - 1,
nodeOutofService)); nodeOutofService);
final DFSClient client = getDfsClient(0); final DFSClient client = getDfsClient(0);
client.setReplication(file.toString(), (short)newFactor); client.setReplication(file.toString(), (short)newFactor);
// Verify that the nodeOutofService remains in blocksMap and // Verify that the nodeOutofService remains in blocksMap and
// # of live replicas for read operation. // # of live replicas for read operation.
assertNull(checkWithRetry(ns, fileSys, file, expectedLiveReplicas, checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
nodeOutofService)); nodeOutofService);
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, newFactor, null)); checkWithRetry(ns, fileSys, file, newFactor, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
teardown(); teardown();
@ -765,8 +767,8 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null, getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
AdminStates.IN_MAINTENANCE); AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
final DFSClient client = getDfsClient(0); final DFSClient client = getDfsClient(0);
assertEquals("All datanodes must be alive", numDatanodes, assertEquals("All datanodes must be alive", numDatanodes,
@ -779,16 +781,16 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
client.datanodeReport(DatanodeReportType.LIVE).length); client.datanodeReport(DatanodeReportType.LIVE).length);
// Dead maintenance node's blocks should remain in block map. // Dead maintenance node's blocks should remain in block map.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
// When dead maintenance mode is transitioned to out of maintenance mode, // When dead maintenance mode is transitioned to out of maintenance mode,
// its blocks should be removed from block map. // its blocks should be removed from block map.
// This will then trigger replication to restore the live replicas back // This will then trigger replication to restore the live replicas back
// to replication factor. // to replication factor.
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService, checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
null)); null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -821,8 +823,8 @@ public void testWithNNAndDNRestart() throws Exception {
getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null, getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
AdminStates.IN_MAINTENANCE); AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
DFSClient client = getDfsClient(0); DFSClient client = getDfsClient(0);
assertEquals("All datanodes must be alive", numDatanodes, assertEquals("All datanodes must be alive", numDatanodes,
@ -836,23 +838,23 @@ public void testWithNNAndDNRestart() throws Exception {
client.datanodeReport(DatanodeReportType.LIVE).length); client.datanodeReport(DatanodeReportType.LIVE).length);
// Dead maintenance node's blocks should remain in block map. // Dead maintenance node's blocks should remain in block map.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService)); nodeOutofService);
// restart nn, nn will restore 3 live replicas given it doesn't // restart nn, nn will restore 3 live replicas given it doesn't
// know the maintenance node has the replica. // know the maintenance node has the replica.
getCluster().restartNameNode(0); getCluster().restartNameNode(0);
ns = getCluster().getNamesystem(0); ns = getCluster().getNamesystem(0);
assertNull(checkWithRetry(ns, fileSys, file, replicas, null)); checkWithRetry(ns, fileSys, file, replicas, null);
// restart dn, nn has 1 maintenance replica and 3 live replicas. // restart dn, nn has 1 maintenance replica and 3 live replicas.
getCluster().restartDataNode(dnProp, true); getCluster().restartDataNode(dnProp, true);
getCluster().waitActive(); getCluster().waitActive();
assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService)); checkWithRetry(ns, fileSys, file, replicas, nodeOutofService);
// Put the node in service, a redundant replica should be removed. // Put the node in service, a redundant replica should be removed.
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null)); checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -878,12 +880,12 @@ public void testWriteAfterMaintenance() throws IOException {
writeFile(fileSys, file, replicas, 2); writeFile(fileSys, file, replicas, 2);
// Verify nodeOutofService wasn't chosen for write operation. // Verify nodeOutofService wasn't chosen for write operation.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1, checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService, null)); nodeOutofService, null);
// Put the node back to service, live replicas should be restored. // Put the node back to service, live replicas should be restored.
putNodeInService(0, nodeOutofService.getDatanodeUuid()); putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null)); checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -934,12 +936,12 @@ public void testInvalidation() throws IOException {
client.setReplication(file.toString(), (short) 1); client.setReplication(file.toString(), (short) 1);
// Verify the nodeOutofService remains in blocksMap. // Verify the nodeOutofService remains in blocksMap.
assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService)); checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
// Restart NN and verify the nodeOutofService remains in blocksMap. // Restart NN and verify the nodeOutofService remains in blocksMap.
getCluster().restartNameNode(0); getCluster().restartNameNode(0);
ns = getCluster().getNamesystem(0); ns = getCluster().getNamesystem(0);
assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService)); checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
cleanupFile(fileSys, file); cleanupFile(fileSys, file);
} }
@ -1081,30 +1083,32 @@ static String checkFile(FSNamesystem ns, FileSystem fileSys,
return null; return null;
} }
static String checkWithRetry(FSNamesystem ns, FileSystem fileSys, static void checkWithRetry(FSNamesystem ns, FileSystem fileSys, Path name,
Path name, int repl, DatanodeInfo inMaintenanceNode) int repl, DatanodeInfo inMaintenanceNode) {
throws IOException { checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
return checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
inMaintenanceNode); inMaintenanceNode);
} }
static String checkWithRetry(FSNamesystem ns, FileSystem fileSys, static void checkWithRetry(final FSNamesystem ns, final FileSystem fileSys,
Path name, int repl, DatanodeInfo excludedNode, final Path name, final int repl, final DatanodeInfo excludedNode,
DatanodeInfo underMaintenanceNode) throws IOException { final DatanodeInfo underMaintenanceNode) {
int tries = 0; try {
String output = null; GenericTestUtils.waitFor(new Supplier<Boolean>() {
while (tries++ < 200) {
try { @Override
Thread.sleep(100); public Boolean get() {
output = checkFile(ns, fileSys, name, repl, excludedNode, String output = null;
underMaintenanceNode); try {
if (output == null) { output = checkFile(ns, fileSys, name, repl, excludedNode,
break; underMaintenanceNode);
} catch (Exception ignored) {
}
return (output == null);
} }
} catch (InterruptedException ie) { }, 100, 60000);
} } catch (Exception ignored) {
} }
return output;
} }
static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos( static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos(