HDFS-11446. TestMaintenanceState#testWithNNAndDNRestart fails intermittently. Contributed by Yiqun Lin.

This commit is contained in:
Yiqun Lin 2017-05-28 11:23:32 +08:00
parent 89bb8bfe58
commit 31058b243e

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -30,12 +29,7 @@
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
@ -48,8 +42,16 @@
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Time;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
/**
* This class tests node maintenance.
@ -125,8 +127,8 @@ public void testTakeNodeOutOfEnteringMaintenance() throws Exception {
// When node is in ENTERING_MAINTENANCE state, it can still serve read
// requests
assertNull(checkWithRetry(ns, fileSys, file, replicas, null,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas, null,
nodeOutofService);
putNodeInService(0, nodeOutofService.getDatanodeUuid());
@ -387,8 +389,8 @@ private void testExpectedReplication(int replicationFactor,
// The block should be replicated to another datanode to meet
// expected replication count.
assertNull(checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
nodeOutofService));
checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
nodeOutofService);
cleanupFile(fileSys, file);
teardown();
@ -548,19 +550,19 @@ public void testTransitionToDecommission() throws IOException {
client.datanodeReport(DatanodeReportType.LIVE).length);
// test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null,
AdminStates.DECOMMISSIONED);
// test 2 after decommission has completed, the replication count is
// replicas + 1 which includes the decommissioned node.
assertNull(checkWithRetry(ns, fileSys, file, replicas + 1, null));
checkWithRetry(ns, fileSys, file, replicas + 1, null);
// test 3, put the node in service, replication count should restore.
putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file);
}
@ -587,8 +589,8 @@ public void testTransitionFromDecommissioning() throws IOException {
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
null, AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
cleanupFile(fileSys, file);
}
@ -631,10 +633,10 @@ private void testDecommissionDifferentNodeAfterMaintenance(int repl)
takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes,
AdminStates.DECOMMISSIONED);
// Out of the replicas returned, one is the decommissioned node.
assertNull(checkWithRetry(ns, fileSys, file, repl, maintenanceDN));
checkWithRetry(ns, fileSys, file, repl, maintenanceDN);
putNodeInService(0, maintenanceDN);
assertNull(checkWithRetry(ns, fileSys, file, repl + 1, null));
checkWithRetry(ns, fileSys, file, repl + 1, null);
cleanupFile(fileSys, file);
teardown();
@ -663,7 +665,7 @@ public void testMultipleNodesMaintenance() throws Exception {
AdminStates.IN_MAINTENANCE);
// Verify file replication matches maintenance state min replication
assertNull(checkWithRetry(ns, fileSys, file, 1, null, nodes[0]));
checkWithRetry(ns, fileSys, file, 1, null, nodes[0]);
// Put the maintenance nodes back in service
for (DatanodeInfo datanodeInfo : maintenanceDN) {
@ -671,7 +673,7 @@ public void testMultipleNodesMaintenance() throws Exception {
}
// Verify file replication catching up to the old state
assertNull(checkWithRetry(ns, fileSys, file, repl, null));
checkWithRetry(ns, fileSys, file, repl, null);
cleanupFile(fileSys, file);
}
@ -720,19 +722,19 @@ private void testChangeReplicationFactor(int oldFactor, int newFactor,
// Verify that the nodeOutofService remains in blocksMap and
// # of live replicas For read operation is expected.
assertNull(checkWithRetry(ns, fileSys, file, oldFactor - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, oldFactor - 1,
nodeOutofService);
final DFSClient client = getDfsClient(0);
client.setReplication(file.toString(), (short)newFactor);
// Verify that the nodeOutofService remains in blocksMap and
// # of live replicas for read operation.
assertNull(checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
nodeOutofService));
checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
nodeOutofService);
putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, newFactor, null));
checkWithRetry(ns, fileSys, file, newFactor, null);
cleanupFile(fileSys, file);
teardown();
@ -765,8 +767,8 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
final DFSClient client = getDfsClient(0);
assertEquals("All datanodes must be alive", numDatanodes,
@ -779,16 +781,16 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
client.datanodeReport(DatanodeReportType.LIVE).length);
// Dead maintenance node's blocks should remain in block map.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
// When dead maintenance mode is transitioned to out of maintenance mode,
// its blocks should be removed from block map.
// This will then trigger replication to restore the live replicas back
// to replication factor.
putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
null));
checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
null);
cleanupFile(fileSys, file);
}
@ -821,8 +823,8 @@ public void testWithNNAndDNRestart() throws Exception {
getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
AdminStates.IN_MAINTENANCE);
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
DFSClient client = getDfsClient(0);
assertEquals("All datanodes must be alive", numDatanodes,
@ -836,23 +838,23 @@ public void testWithNNAndDNRestart() throws Exception {
client.datanodeReport(DatanodeReportType.LIVE).length);
// Dead maintenance node's blocks should remain in block map.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService);
// restart nn, nn will restore 3 live replicas given it doesn't
// know the maintenance node has the replica.
getCluster().restartNameNode(0);
ns = getCluster().getNamesystem(0);
assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
checkWithRetry(ns, fileSys, file, replicas, null);
// restart dn, nn has 1 maintenance replica and 3 live replicas.
getCluster().restartDataNode(dnProp, true);
getCluster().waitActive();
assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService));
checkWithRetry(ns, fileSys, file, replicas, nodeOutofService);
// Put the node in service, a redundant replica should be removed.
putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file);
}
@ -878,12 +880,12 @@ public void testWriteAfterMaintenance() throws IOException {
writeFile(fileSys, file, replicas, 2);
// Verify nodeOutofService wasn't chosen for write operation.
assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService, null));
checkWithRetry(ns, fileSys, file, replicas - 1,
nodeOutofService, null);
// Put the node back to service, live replicas should be restored.
putNodeInService(0, nodeOutofService.getDatanodeUuid());
assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
checkWithRetry(ns, fileSys, file, replicas, null);
cleanupFile(fileSys, file);
}
@ -934,12 +936,12 @@ public void testInvalidation() throws IOException {
client.setReplication(file.toString(), (short) 1);
// Verify the nodeOutofService remains in blocksMap.
assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
// Restart NN and verify the nodeOutofService remains in blocksMap.
getCluster().restartNameNode(0);
ns = getCluster().getNamesystem(0);
assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
cleanupFile(fileSys, file);
}
@ -1081,30 +1083,32 @@ static String checkFile(FSNamesystem ns, FileSystem fileSys,
return null;
}
static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
Path name, int repl, DatanodeInfo inMaintenanceNode)
throws IOException {
return checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
// Convenience overload: delegates to the full form, passing the
// in-maintenance node as both the excluded node and the
// under-maintenance node for the replica check.
static void checkWithRetry(FSNamesystem ns, FileSystem fileSys, Path name,
int repl, DatanodeInfo inMaintenanceNode) {
checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
inMaintenanceNode);
}
static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
Path name, int repl, DatanodeInfo excludedNode,
DatanodeInfo underMaintenanceNode) throws IOException {
int tries = 0;
String output = null;
while (tries++ < 200) {
try {
Thread.sleep(100);
output = checkFile(ns, fileSys, name, repl, excludedNode,
underMaintenanceNode);
if (output == null) {
break;
static void checkWithRetry(final FSNamesystem ns, final FileSystem fileSys,
final Path name, final int repl, final DatanodeInfo excludedNode,
final DatanodeInfo underMaintenanceNode) {
try {
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
String output = null;
try {
output = checkFile(ns, fileSys, name, repl, excludedNode,
underMaintenanceNode);
} catch (Exception ignored) {
}
return (output == null);
}
} catch (InterruptedException ie) {
}
}, 100, 60000);
} catch (Exception ignored) {
}
return output;
}
static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos(