HDFS-16373. Fix MiniDFSCluster restart in case of multiple namenodes. (#3756)
Reviewed-by: Viraj Jasani <vjasani@apache.org>
Reviewed-by: litao <tomleescut@gmail.com>
Signed-off-by: Takanobu Asanuma <tasanuma@apache.org>
This commit is contained in:
parent
c56a07f36b
commit
d29f0e83a9
@@ -2267,9 +2267,11 @@ public synchronized void restartNameNode(int nnIndex, boolean waitActive,
|
||||
info.nameNode = nn;
|
||||
info.setStartOpt(startOpt);
|
||||
if (waitActive) {
|
||||
waitClusterUp();
|
||||
if (numDataNodes > 0) {
|
||||
waitNameNodeUp(nnIndex);
|
||||
}
|
||||
LOG.info("Restarted the namenode");
|
||||
waitActive();
|
||||
waitActive(nnIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2775,11 +2777,25 @@ public void waitActive(int nnIndex) throws IOException {
|
||||
DFSClient client = new DFSClient(addr, conf);
|
||||
|
||||
// ensure all datanodes have registered and sent heartbeat to the namenode
|
||||
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
|
||||
int failedCount = 0;
|
||||
while (true) {
|
||||
try {
|
||||
LOG.info("Waiting for cluster to become active");
|
||||
Thread.sleep(100);
|
||||
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
|
||||
LOG.info("Waiting for cluster to become active");
|
||||
Thread.sleep(100);
|
||||
}
|
||||
break;
|
||||
} catch (IOException e) {
|
||||
failedCount++;
|
||||
// Cached RPC connection to namenode, if any, is expected to fail once
|
||||
if (failedCount > 1) {
|
||||
LOG.warn("Tried waitActive() " + failedCount
|
||||
+ " time(s) and failed, giving up. " + StringUtils
|
||||
.stringifyException(e));
|
||||
throw e;
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2815,22 +2831,7 @@ public Boolean get() {
|
||||
*/
|
||||
public void waitActive() throws IOException {
|
||||
for (int index = 0; index < namenodes.size(); index++) {
|
||||
int failedCount = 0;
|
||||
while (true) {
|
||||
try {
|
||||
waitActive(index);
|
||||
break;
|
||||
} catch (IOException e) {
|
||||
failedCount++;
|
||||
// Cached RPC connection to namenode, if any, is expected to fail once
|
||||
if (failedCount > 1) {
|
||||
LOG.warn("Tried waitActive() " + failedCount
|
||||
+ " time(s) and failed, giving up. "
|
||||
+ StringUtils.stringifyException(e));
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
waitActive(index);
|
||||
}
|
||||
LOG.info("Cluster is active");
|
||||
}
|
||||
|
@@ -309,6 +309,14 @@ public void testSetUpFederatedCluster() throws Exception {
|
||||
DFSUtil.addKeySuffixes(
|
||||
DFS_NAMENODE_HTTP_ADDRESS_KEY, "ns1", "nn1")));
|
||||
}
|
||||
|
||||
// Shutdown namenodes individually.
|
||||
cluster.shutdownNameNode(0);
|
||||
cluster.shutdownNameNode(1);
|
||||
|
||||
// Restart namenodes individually with wait active, both should be successful.
|
||||
cluster.restartNameNode(0);
|
||||
cluster.restartNameNode(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user