HDFS-11446. TestMaintenanceState#testWithNNAndDNRestart fails intermittently. Contributed by Yiqun Lin.
parent 89bb8bfe58
commit 31058b243e
@@ -18,7 +18,6 @@
 package org.apache.hadoop.hdfs;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -30,12 +29,7 @@
 import java.util.List;
 import java.util.Map;
 
-import com.google.common.collect.Lists;
 import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
-import org.junit.Assert;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
@@ -48,8 +42,16 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.Time;
+import org.junit.Assert;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
 
 /**
  * This class tests node maintenance.
@@ -125,8 +127,8 @@ public void testTakeNodeOutOfEnteringMaintenance() throws Exception {
 
     // When node is in ENTERING_MAINTENANCE state, it can still serve read
    // requests
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas, null,
+        nodeOutofService);
 
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
 
@@ -387,8 +389,8 @@ private void testExpectedReplication(int replicationFactor,
 
     // The block should be replicated to another datanode to meet
     // expected replication count.
-    assertNull(checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
+        nodeOutofService);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -548,19 +550,19 @@ public void testTransitionToDecommission() throws IOException {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null,
         AdminStates.DECOMMISSIONED);
 
     // test 2 after decommission has completed, the replication count is
     // replicas + 1 which includes the decommissioned node.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas + 1, null));
+    checkWithRetry(ns, fileSys, file, replicas + 1, null);
 
     // test 3, put the node in service, replication count should restore.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -587,8 +589,8 @@ public void testTransitionFromDecommissioning() throws IOException {
     takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
         null, AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     cleanupFile(fileSys, file);
   }
@@ -631,10 +633,10 @@ private void testDecommissionDifferentNodeAfterMaintenance(int repl)
     takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes,
         AdminStates.DECOMMISSIONED);
     // Out of the replicas returned, one is the decommissioned node.
-    assertNull(checkWithRetry(ns, fileSys, file, repl, maintenanceDN));
+    checkWithRetry(ns, fileSys, file, repl, maintenanceDN);
 
     putNodeInService(0, maintenanceDN);
-    assertNull(checkWithRetry(ns, fileSys, file, repl + 1, null));
+    checkWithRetry(ns, fileSys, file, repl + 1, null);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -663,7 +665,7 @@ public void testMultipleNodesMaintenance() throws Exception {
         AdminStates.IN_MAINTENANCE);
 
     // Verify file replication matches maintenance state min replication
-    assertNull(checkWithRetry(ns, fileSys, file, 1, null, nodes[0]));
+    checkWithRetry(ns, fileSys, file, 1, null, nodes[0]);
 
     // Put the maintenance nodes back in service
     for (DatanodeInfo datanodeInfo : maintenanceDN) {
@@ -671,7 +673,7 @@ public void testMultipleNodesMaintenance() throws Exception {
     }
 
     // Verify file replication catching up to the old state
-    assertNull(checkWithRetry(ns, fileSys, file, repl, null));
+    checkWithRetry(ns, fileSys, file, repl, null);
 
     cleanupFile(fileSys, file);
   }
@@ -720,19 +722,19 @@ private void testChangeReplicationFactor(int oldFactor, int newFactor,
 
     // Verify that the nodeOutofService remains in blocksMap and
     // # of live replicas For read operation is expected.
-    assertNull(checkWithRetry(ns, fileSys, file, oldFactor - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, oldFactor - 1,
+        nodeOutofService);
 
     final DFSClient client = getDfsClient(0);
     client.setReplication(file.toString(), (short)newFactor);
 
     // Verify that the nodeOutofService remains in blocksMap and
     // # of live replicas for read operation.
-    assertNull(checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
+        nodeOutofService);
 
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, newFactor, null));
+    checkWithRetry(ns, fileSys, file, newFactor, null);
 
     cleanupFile(fileSys, file);
     teardown();
@@ -765,8 +767,8 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
         getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
         AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     final DFSClient client = getDfsClient(0);
     assertEquals("All datanodes must be alive", numDatanodes,
@@ -779,16 +781,16 @@ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // Dead maintenance node's blocks should remain in block map.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     // When dead maintenance mode is transitioned to out of maintenance mode,
     // its blocks should be removed from block map.
     // This will then trigger replication to restore the live replicas back
     // to replication factor.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
-        null));
+    checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
+        null);
 
     cleanupFile(fileSys, file);
   }
@@ -821,8 +823,8 @@ public void testWithNNAndDNRestart() throws Exception {
        getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
         AdminStates.IN_MAINTENANCE);
 
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     DFSClient client = getDfsClient(0);
     assertEquals("All datanodes must be alive", numDatanodes,
@@ -836,23 +838,23 @@ public void testWithNNAndDNRestart() throws Exception {
         client.datanodeReport(DatanodeReportType.LIVE).length);
 
     // Dead maintenance node's blocks should remain in block map.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService);
 
     // restart nn, nn will restore 3 live replicas given it doesn't
     // know the maintenance node has the replica.
     getCluster().restartNameNode(0);
     ns = getCluster().getNamesystem(0);
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     // restart dn, nn has 1 maintenance replica and 3 live replicas.
     getCluster().restartDataNode(dnProp, true);
     getCluster().waitActive();
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, replicas, nodeOutofService);
 
     // Put the node in service, a redundant replica should be removed.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -878,12 +880,12 @@ public void testWriteAfterMaintenance() throws IOException {
     writeFile(fileSys, file, replicas, 2);
 
     // Verify nodeOutofService wasn't chosen for write operation.
-    assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
-        nodeOutofService, null));
+    checkWithRetry(ns, fileSys, file, replicas - 1,
+        nodeOutofService, null);
 
     // Put the node back to service, live replicas should be restored.
     putNodeInService(0, nodeOutofService.getDatanodeUuid());
-    assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+    checkWithRetry(ns, fileSys, file, replicas, null);
 
     cleanupFile(fileSys, file);
   }
@@ -934,12 +936,12 @@ public void testInvalidation() throws IOException {
     client.setReplication(file.toString(), (short) 1);
 
     // Verify the nodeOutofService remains in blocksMap.
-    assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
 
     // Restart NN and verify the nodeOutofService remains in blocksMap.
     getCluster().restartNameNode(0);
     ns = getCluster().getNamesystem(0);
-    assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+    checkWithRetry(ns, fileSys, file, 1, nodeOutofService);
 
     cleanupFile(fileSys, file);
   }
@@ -1081,30 +1083,32 @@ static String checkFile(FSNamesystem ns, FileSystem fileSys,
     return null;
   }
 
-  static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
-      Path name, int repl, DatanodeInfo inMaintenanceNode)
-      throws IOException {
-    return checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
+  static void checkWithRetry(FSNamesystem ns, FileSystem fileSys, Path name,
+      int repl, DatanodeInfo inMaintenanceNode) {
+    checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
         inMaintenanceNode);
   }
 
-  static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
-      Path name, int repl, DatanodeInfo excludedNode,
-      DatanodeInfo underMaintenanceNode) throws IOException {
-    int tries = 0;
-    String output = null;
-    while (tries++ < 200) {
-      try {
-        Thread.sleep(100);
-        output = checkFile(ns, fileSys, name, repl, excludedNode,
-            underMaintenanceNode);
-        if (output == null) {
-          break;
-        }
+  static void checkWithRetry(final FSNamesystem ns, final FileSystem fileSys,
+      final Path name, final int repl, final DatanodeInfo excludedNode,
+      final DatanodeInfo underMaintenanceNode) {
+    try {
+      GenericTestUtils.waitFor(new Supplier<Boolean>() {
+
+        @Override
+        public Boolean get() {
+          String output = null;
+          try {
+            output = checkFile(ns, fileSys, name, repl, excludedNode,
+                underMaintenanceNode);
+          } catch (Exception ignored) {
+          }
+
+          return (output == null);
         }
-      } catch (InterruptedException ie) {
-      }
+      }, 100, 60000);
+    } catch (Exception ignored) {
     }
-    return output;
   }
 
   static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos(
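
The substantive change is the final hunk: checkWithRetry previously hand-rolled its retry loop, giving up after 200 attempts at 100 ms intervals (roughly 20 seconds) and returning a String that every caller wrapped in assertNull. It now delegates to GenericTestUtils.waitFor, which polls checkFile every 100 ms for up to 60 seconds. The sketch below illustrates that polling idiom with a hypothetical standalone waitFor; the real utility lives in org.apache.hadoop.test.GenericTestUtils and takes a Guava com.google.common.base.Supplier<Boolean>, while this sketch uses java.util.function.BooleanSupplier to stay dependency-free.

import java.util.concurrent.TimeoutException;
import java.util.function.BooleanSupplier;

public class WaitForSketch {

  // Hypothetical stand-in for GenericTestUtils.waitFor: re-evaluate the
  // condition every checkEveryMillis until it returns true, or throw
  // TimeoutException once waitForMillis has elapsed.
  static void waitFor(BooleanSupplier check, long checkEveryMillis,
      long waitForMillis) throws TimeoutException, InterruptedException {
    long deadline = System.currentTimeMillis() + waitForMillis;
    while (!check.getAsBoolean()) {
      if (System.currentTimeMillis() >= deadline) {
        throw new TimeoutException("Timed out waiting for condition");
      }
      Thread.sleep(checkEveryMillis);
    }
  }

  public static void main(String[] args) throws Exception {
    long start = System.currentTimeMillis();
    // Mirrors the test's usage: poll every 100 ms, wait at most 60 s.
    // Here the condition becomes true after roughly half a second.
    waitFor(() -> System.currentTimeMillis() - start > 500, 100, 60000);
    System.out.println("condition met after "
        + (System.currentTimeMillis() - start) + " ms");
  }
}

The fixed 20-second budget was the likely source of the intermittent testWithNNAndDNRestart failures: a NameNode restart plus re-replication can take longer than that on a loaded build machine, so tripling the wait and centralizing the retry logic in one utility makes the checks far less flaky.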