YARN-1641. ZK store should attempt a write periodically to ensure it is still Active. (kasha)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1567628 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d82bc78733
commit
41ec3cce76
@ -176,6 +176,9 @@ Release 2.4.0 - UNRELEASED
|
||||
YARN-1706. Created an utility method to dump timeline records to JSON
|
||||
strings. (zjshen)
|
||||
|
||||
YARN-1641. ZK store should attempt a write periodically to ensure it is
|
||||
still Active. (kasha)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
BUG FIXES
|
||||
|
@ -676,11 +676,11 @@ protected void handleStoreEvent(RMStateStoreEvent event) {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
/**
|
||||
* In {#handleStoreEvent}, this method is called to notify the
|
||||
* ResourceManager that the store operation has failed.
|
||||
* This method is called to notify the ResourceManager that the store
|
||||
* operation has failed.
|
||||
* @param failureCause the exception due to which the operation failed
|
||||
*/
|
||||
private void notifyStoreOperationFailed(Exception failureCause) {
|
||||
protected void notifyStoreOperationFailed(Exception failureCause) {
|
||||
RMFatalEventType type;
|
||||
if (failureCause instanceof StoreFencedException) {
|
||||
type = RMFatalEventType.STATE_STORE_FENCED;
|
||||
|
@ -137,6 +137,7 @@ public class ZKRMStateStore extends RMStateStore {
|
||||
private String fencingNodePath;
|
||||
private Op createFencingNodePathOp;
|
||||
private Op deleteFencingNodePathOp;
|
||||
private Thread verifyActiveStatusThread;
|
||||
private String zkRootNodeUsername;
|
||||
private final String zkRootNodePassword = Long.toString(random.nextLong());
|
||||
|
||||
@ -258,6 +259,8 @@ public synchronized void startInternal() throws Exception {
|
||||
createRootDir(zkRootNodePath);
|
||||
if (HAUtil.isHAEnabled(getConfig())){
|
||||
fence();
|
||||
verifyActiveStatusThread = new VerifyActiveStatusThread();
|
||||
verifyActiveStatusThread.start();
|
||||
}
|
||||
createRootDir(rmAppRoot);
|
||||
createRootDir(rmDTSecretManagerRoot);
|
||||
@ -350,6 +353,10 @@ private synchronized void closeZkClients() throws IOException {
|
||||
|
||||
@Override
|
||||
protected synchronized void closeInternal() throws Exception {
|
||||
if (verifyActiveStatusThread != null) {
|
||||
verifyActiveStatusThread.interrupt();
|
||||
verifyActiveStatusThread.join(1000);
|
||||
}
|
||||
closeZkClients();
|
||||
}
|
||||
|
||||
@ -856,6 +863,32 @@ List<String> run() throws KeeperException, InterruptedException {
|
||||
}.runWithRetries();
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper class that periodically attempts creating a znode to ensure that
|
||||
* this RM continues to be the Active.
|
||||
*/
|
||||
private class VerifyActiveStatusThread extends Thread {
|
||||
private List<Op> emptyOpList = new ArrayList<Op>();
|
||||
|
||||
VerifyActiveStatusThread() {
|
||||
super(VerifyActiveStatusThread.class.getName());
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
while (true) {
|
||||
doMultiWithRetries(emptyOpList);
|
||||
Thread.sleep(zkSessionTimeout);
|
||||
}
|
||||
} catch (InterruptedException ie) {
|
||||
LOG.info(VerifyActiveStatusThread.class.getName() + " thread " +
|
||||
"interrupted! Exiting!");
|
||||
} catch (Exception e) {
|
||||
notifyStoreOperationFailed(new StoreFencedException());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private abstract class ZKAction<T> {
|
||||
// run() expects synchronization on ZKRMStateStore.this
|
||||
abstract T run() throws KeeperException, InterruptedException;
|
||||
|
@ -23,10 +23,7 @@
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
@ -34,15 +31,8 @@
|
||||
import org.apache.hadoop.ha.HAServiceProtocol;
|
||||
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
|
||||
import org.apache.hadoop.service.Service;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationRequest;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.Priority;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.conf.HAUtil;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.ClientRMService;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.RMStateVersion;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.impl.pb.RMStateVersionPBImpl;
|
||||
@ -54,6 +44,7 @@
|
||||
public class TestZKRMStateStore extends RMStateStoreTestBase {
|
||||
|
||||
public static final Log LOG = LogFactory.getLog(TestZKRMStateStore.class);
|
||||
private static final int ZK_TIMEOUT_MS = 1000;
|
||||
|
||||
class TestZKRMStateStoreTester implements RMStateStoreHelper {
|
||||
|
||||
@ -141,6 +132,7 @@ private Configuration createHARMConf(
|
||||
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
|
||||
conf.set(YarnConfiguration.RM_STORE, ZKRMStateStore.class.getName());
|
||||
conf.set(YarnConfiguration.RM_ZK_ADDRESS, hostPort);
|
||||
conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, ZK_TIMEOUT_MS);
|
||||
conf.set(YarnConfiguration.RM_HA_ID, rmId);
|
||||
for (String rpcAddress : YarnConfiguration.RM_SERVICES_ADDRESS_CONF_KEYS) {
|
||||
for (String id : HAUtil.getRMHAIds(conf)) {
|
||||
@ -182,26 +174,7 @@ public void testFencing() throws Exception {
|
||||
HAServiceProtocol.HAServiceState.ACTIVE,
|
||||
rm2.getRMContext().getRMAdminService().getServiceStatus().getState());
|
||||
|
||||
// Submitting an application to RM1 to trigger a state store operation.
|
||||
// RM1 should realize that it got fenced and is not the Active RM anymore.
|
||||
Map mockMap = mock(Map.class);
|
||||
ApplicationSubmissionContext asc =
|
||||
ApplicationSubmissionContext.newInstance(
|
||||
ApplicationId.newInstance(1000, 1),
|
||||
"testApplication", // app Name
|
||||
"default", // queue name
|
||||
Priority.newInstance(0),
|
||||
ContainerLaunchContext.newInstance(mockMap, mockMap,
|
||||
new ArrayList<String>(), mockMap, mock(ByteBuffer.class),
|
||||
mockMap),
|
||||
false, // unmanaged AM
|
||||
true, // cancelTokens
|
||||
1, // max app attempts
|
||||
Resource.newInstance(1024, 1));
|
||||
ClientRMService rmService = rm1.getClientRMService();
|
||||
rmService.submitApplication(SubmitApplicationRequest.newInstance(asc));
|
||||
|
||||
for (int i = 0; i < 30; i++) {
|
||||
for (int i = 0; i < ZK_TIMEOUT_MS / 50; i++) {
|
||||
if (HAServiceProtocol.HAServiceState.ACTIVE ==
|
||||
rm1.getRMContext().getRMAdminService().getServiceStatus().getState()) {
|
||||
Thread.sleep(100);
|
||||
|
Loading…
Reference in New Issue
Block a user