HDFS-16373. Fix MiniDFSCluster restart in case of multiple namenodes. (#3756)
Reviewed-by: Viraj Jasani <vjasani@apache.org>
Reviewed-by: litao <tomleescut@gmail.com>
Signed-off-by: Takanobu Asanuma <tasanuma@apache.org>
This commit is contained in:
parent
c56a07f36b
commit
d29f0e83a9
@ -2267,9 +2267,11 @@ public synchronized void restartNameNode(int nnIndex, boolean waitActive,
|
|||||||
info.nameNode = nn;
|
info.nameNode = nn;
|
||||||
info.setStartOpt(startOpt);
|
info.setStartOpt(startOpt);
|
||||||
if (waitActive) {
|
if (waitActive) {
|
||||||
waitClusterUp();
|
if (numDataNodes > 0) {
|
||||||
|
waitNameNodeUp(nnIndex);
|
||||||
|
}
|
||||||
LOG.info("Restarted the namenode");
|
LOG.info("Restarted the namenode");
|
||||||
waitActive();
|
waitActive(nnIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2775,11 +2777,25 @@ public void waitActive(int nnIndex) throws IOException {
|
|||||||
DFSClient client = new DFSClient(addr, conf);
|
DFSClient client = new DFSClient(addr, conf);
|
||||||
|
|
||||||
// ensure all datanodes have registered and sent heartbeat to the namenode
|
// ensure all datanodes have registered and sent heartbeat to the namenode
|
||||||
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
|
int failedCount = 0;
|
||||||
|
while (true) {
|
||||||
try {
|
try {
|
||||||
LOG.info("Waiting for cluster to become active");
|
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
|
||||||
Thread.sleep(100);
|
LOG.info("Waiting for cluster to become active");
|
||||||
|
Thread.sleep(100);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
} catch (IOException e) {
|
||||||
|
failedCount++;
|
||||||
|
// Cached RPC connection to namenode, if any, is expected to fail once
|
||||||
|
if (failedCount > 1) {
|
||||||
|
LOG.warn("Tried waitActive() " + failedCount
|
||||||
|
+ " time(s) and failed, giving up. " + StringUtils
|
||||||
|
.stringifyException(e));
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
|
throw new IOException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2815,22 +2831,7 @@ public Boolean get() {
|
|||||||
*/
|
*/
|
||||||
public void waitActive() throws IOException {
|
public void waitActive() throws IOException {
|
||||||
for (int index = 0; index < namenodes.size(); index++) {
|
for (int index = 0; index < namenodes.size(); index++) {
|
||||||
int failedCount = 0;
|
waitActive(index);
|
||||||
while (true) {
|
|
||||||
try {
|
|
||||||
waitActive(index);
|
|
||||||
break;
|
|
||||||
} catch (IOException e) {
|
|
||||||
failedCount++;
|
|
||||||
// Cached RPC connection to namenode, if any, is expected to fail once
|
|
||||||
if (failedCount > 1) {
|
|
||||||
LOG.warn("Tried waitActive() " + failedCount
|
|
||||||
+ " time(s) and failed, giving up. "
|
|
||||||
+ StringUtils.stringifyException(e));
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
LOG.info("Cluster is active");
|
LOG.info("Cluster is active");
|
||||||
}
|
}
|
||||||
|
@ -309,6 +309,14 @@ public void testSetUpFederatedCluster() throws Exception {
|
|||||||
DFSUtil.addKeySuffixes(
|
DFSUtil.addKeySuffixes(
|
||||||
DFS_NAMENODE_HTTP_ADDRESS_KEY, "ns1", "nn1")));
|
DFS_NAMENODE_HTTP_ADDRESS_KEY, "ns1", "nn1")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Shutdown namenodes individually.
|
||||||
|
cluster.shutdownNameNode(0);
|
||||||
|
cluster.shutdownNameNode(1);
|
||||||
|
|
||||||
|
// Restart namenodes individually with wait active, both should be successful.
|
||||||
|
cluster.restartNameNode(0);
|
||||||
|
cluster.restartNameNode(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user