YARN-2709. Made timeline client getDelegationToken API retry if ConnectException happens. Contributed by Li Lu.
This commit is contained in:
parent
4baca311ff
commit
b2942762d7
@ -382,6 +382,9 @@ Release 2.6.0 - UNRELEASED
|
||||
YARN-90. NodeManager should identify failed disks becoming good again
|
||||
(Varun Vasudev via jlowe)
|
||||
|
||||
YARN-2709. Made timeline client getDelegationToken API retry if ConnectException
|
||||
happens. (Li Lu via zjshen)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
BUG FIXES
|
||||
|
@ -107,9 +107,23 @@ public class TimelineClientImpl extends TimelineClient {
|
||||
private URI resURI;
|
||||
private boolean isEnabled;
|
||||
|
||||
private TimelineJerseyRetryFilter retryFilter;
|
||||
@Private
|
||||
@VisibleForTesting
|
||||
TimelineClientConnectionRetry connectionRetry;
|
||||
|
||||
static class TimelineJerseyRetryFilter extends ClientFilter {
|
||||
// Abstract class for an operation that should be retried by timeline client
|
||||
private static abstract class TimelineClientRetryOp {
|
||||
// The operation that should be retried
|
||||
public abstract Object run() throws IOException;
|
||||
// The method to indicate if we should retry given the incoming exception
|
||||
public abstract boolean shouldRetryOn(Exception e);
|
||||
}
|
||||
|
||||
// Class to handle retry
|
||||
// Outside this class, only visible to tests
|
||||
@Private
|
||||
@VisibleForTesting
|
||||
static class TimelineClientConnectionRetry {
|
||||
// maxRetries < 0 means keep trying
|
||||
@Private
|
||||
@VisibleForTesting
|
||||
@ -119,14 +133,14 @@ static class TimelineJerseyRetryFilter extends ClientFilter {
|
||||
@VisibleForTesting
|
||||
public long retryInterval;
|
||||
|
||||
// Indicates if retries happened last time
|
||||
// Indicates if retries happened last time. Only tests should read it.
|
||||
// In unit tests, retryOn() calls should _not_ be concurrent.
|
||||
@Private
|
||||
@VisibleForTesting
|
||||
public boolean retried = false;
|
||||
|
||||
// Constructor with default retry settings
|
||||
public TimelineJerseyRetryFilter(Configuration conf) {
|
||||
super();
|
||||
public TimelineClientConnectionRetry(Configuration conf) {
|
||||
maxRetries = conf.getInt(
|
||||
YarnConfiguration.TIMELINE_SERVICE_CLIENT_MAX_RETRIES,
|
||||
YarnConfiguration.DEFAULT_TIMELINE_SERVICE_CLIENT_MAX_RETRIES);
|
||||
@ -135,32 +149,36 @@ public TimelineJerseyRetryFilter(Configuration conf) {
|
||||
YarnConfiguration.DEFAULT_TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ClientResponse handle(ClientRequest cr)
|
||||
throws ClientHandlerException {
|
||||
public Object retryOn(TimelineClientRetryOp op)
|
||||
throws RuntimeException, IOException {
|
||||
int leftRetries = maxRetries;
|
||||
retried = false;
|
||||
|
||||
// keep trying
|
||||
while (true) {
|
||||
try {
|
||||
// try pass the request on, if fail, keep retrying
|
||||
return getNext().handle(cr);
|
||||
} catch (ClientHandlerException e) {
|
||||
// try perform the op, if fail, keep retrying
|
||||
return op.run();
|
||||
} catch (IOException e) {
|
||||
// We may only throw runtime and IO exceptions. After switching to
|
||||
// Java 1.7, we can merge these two catch blocks into one.
|
||||
|
||||
// break if there's no retries left
|
||||
if (leftRetries == 0) {
|
||||
break;
|
||||
}
|
||||
if(e.getCause() instanceof ConnectException) {
|
||||
if (leftRetries > 0) {
|
||||
LOG.info("Connection Timeout (" + cr.getURI() + "), will try "
|
||||
+ leftRetries + " more time(s).");
|
||||
if (op.shouldRetryOn(e)) {
|
||||
logException(e, leftRetries);
|
||||
} else {
|
||||
// note that maxRetries may be -1 at the very beginning
|
||||
// maxRetries = -1 means keep trying
|
||||
LOG.info("Connection Timeout (" + cr.getURI()
|
||||
+ "), will keep retrying.");
|
||||
throw e;
|
||||
}
|
||||
retried = true;
|
||||
} catch (RuntimeException e) {
|
||||
// break if there's no retries left
|
||||
if (leftRetries == 0) {
|
||||
break;
|
||||
}
|
||||
if (op.shouldRetryOn(e)) {
|
||||
logException(e, leftRetries);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
@ -168,6 +186,7 @@ public ClientResponse handle(ClientRequest cr)
|
||||
if (leftRetries > 0) {
|
||||
leftRetries--;
|
||||
}
|
||||
retried = true;
|
||||
try {
|
||||
// sleep for the given time interval
|
||||
Thread.sleep(retryInterval);
|
||||
@ -175,10 +194,51 @@ public ClientResponse handle(ClientRequest cr)
|
||||
LOG.warn("Client retry sleep interrupted! ");
|
||||
}
|
||||
}
|
||||
throw new ClientHandlerException("Failed to connect to timeline server. "
|
||||
throw new RuntimeException("Failed to connect to timeline server. "
|
||||
+ "Connection retries limit exceeded. "
|
||||
+ "The posted timeline event may be missing");
|
||||
};
|
||||
|
||||
private void logException(Exception e, int leftRetries) {
|
||||
if (leftRetries > 0) {
|
||||
LOG.info("Exception caught by TimelineClientConnectionRetry,"
|
||||
+ " will try " + leftRetries + " more time(s).\nMessage: "
|
||||
+ e.getMessage());
|
||||
} else {
|
||||
// note that maxRetries may be -1 at the very beginning
|
||||
LOG.info("ConnectionException caught by TimelineClientConnectionRetry,"
|
||||
+ " will keep retrying.\nMessage: "
|
||||
+ e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class TimelineJerseyRetryFilter extends ClientFilter {
|
||||
@Override
|
||||
public ClientResponse handle(final ClientRequest cr)
|
||||
throws ClientHandlerException {
|
||||
// Set up the retry operation
|
||||
TimelineClientRetryOp jerseyRetryOp = new TimelineClientRetryOp() {
|
||||
@Override
|
||||
public Object run() {
|
||||
// Try pass the request, if fail, keep retrying
|
||||
return getNext().handle(cr);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldRetryOn(Exception e) {
|
||||
// Only retry on connection exceptions
|
||||
return (e instanceof ClientHandlerException)
|
||||
&& (e.getCause() instanceof ConnectException);
|
||||
}
|
||||
};
|
||||
try {
|
||||
return (ClientResponse) connectionRetry.retryOn(jerseyRetryOp);
|
||||
} catch (IOException e) {
|
||||
throw new ClientHandlerException("Jersey retry failed!\nMessage: "
|
||||
+ e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public TimelineClientImpl() {
|
||||
@ -201,10 +261,12 @@ protected void serviceInit(Configuration conf) throws Exception {
|
||||
authenticator = new PseudoDelegationTokenAuthenticator();
|
||||
}
|
||||
authenticator.setConnectionConfigurator(connConfigurator);
|
||||
token = new DelegationTokenAuthenticatedURL.Token();
|
||||
|
||||
connectionRetry = new TimelineClientConnectionRetry(conf);
|
||||
client = new Client(new URLConnectionClientHandler(
|
||||
new TimelineURLConnectionFactory()), cc);
|
||||
token = new DelegationTokenAuthenticatedURL.Token();
|
||||
retryFilter = new TimelineJerseyRetryFilter(conf);
|
||||
TimelineJerseyRetryFilter retryFilter = new TimelineJerseyRetryFilter();
|
||||
client.addFilter(retryFilter);
|
||||
|
||||
if (YarnConfiguration.useHttps(conf)) {
|
||||
@ -282,6 +344,11 @@ private ClientResponse doPosting(Object obj, String path) throws IOException, Ya
|
||||
@Override
|
||||
public Token<TimelineDelegationTokenIdentifier> getDelegationToken(
|
||||
final String renewer) throws IOException, YarnException {
|
||||
// Set up the retry operation
|
||||
TimelineClientRetryOp tokenRetryOp = new TimelineClientRetryOp() {
|
||||
@Override
|
||||
public Object run() throws IOException {
|
||||
// Try pass the request, if fail, keep retrying
|
||||
boolean isProxyAccess =
|
||||
UserGroupInformation.getCurrentUser().getAuthenticationMethod()
|
||||
== UserGroupInformation.AuthenticationMethod.PROXY;
|
||||
@ -307,11 +374,15 @@ public Token<TimelineDelegationTokenIdentifier> run() throws Exception {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public boolean shouldRetryOn(Exception e) {
|
||||
// Only retry on connection exceptions
|
||||
return (e instanceof ConnectException);
|
||||
}
|
||||
};
|
||||
|
||||
@Private
|
||||
@VisibleForTesting
|
||||
public TimelineJerseyRetryFilter getRetryFilter() {
|
||||
return retryFilter;
|
||||
return (Token<TimelineDelegationTokenIdentifier>)
|
||||
connectionRetry.retryOn(tokenRetryOp);
|
||||
}
|
||||
|
||||
@Private
|
||||
|
@ -27,6 +27,8 @@
|
||||
|
||||
import java.net.ConnectException;
|
||||
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
|
||||
import org.apache.hadoop.yarn.api.records.timeline.TimelineEntity;
|
||||
import org.apache.hadoop.yarn.api.records.timeline.TimelineEvent;
|
||||
@ -183,8 +185,8 @@ public void testPutDomainConnectionRefused() throws Exception {
|
||||
|
||||
@Test
|
||||
public void testCheckRetryCount() throws Exception {
|
||||
int newMaxRetries = 1;
|
||||
long newIntervalMs = 1500;
|
||||
int newMaxRetries = 5;
|
||||
long newIntervalMs = 500;
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
conf.setInt(YarnConfiguration.TIMELINE_SERVICE_CLIENT_MAX_RETRIES,
|
||||
newMaxRetries);
|
||||
@ -197,13 +199,44 @@ public void testCheckRetryCount() throws Exception {
|
||||
client.putEntities(generateEntity());
|
||||
Assert.fail("Exception expected!"
|
||||
+ "Timeline server should be off to run this test. ");
|
||||
} catch (ClientHandlerException ce) {
|
||||
} catch (RuntimeException ce) {
|
||||
Assert.assertTrue(
|
||||
"Handler exception for reason other than retry: " + ce.getMessage(),
|
||||
ce.getMessage().contains("Connection retries limit exceeded"));
|
||||
// we would expect this exception here, check if the client has retried
|
||||
Assert.assertTrue("Retry filter didn't perform any retries! ", client
|
||||
.getRetryFilter().retried);
|
||||
.connectionRetry.retried);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTokenRetry() throws Exception {
|
||||
int newMaxRetries = 5;
|
||||
long newIntervalMs = 500;
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
conf.setInt(YarnConfiguration.TIMELINE_SERVICE_CLIENT_MAX_RETRIES,
|
||||
newMaxRetries);
|
||||
conf.setLong(YarnConfiguration.TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS,
|
||||
newIntervalMs);
|
||||
conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
|
||||
// use kerberos to bypass the issue in HADOOP-11215
|
||||
conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION,
|
||||
"kerberos");
|
||||
UserGroupInformation.setConfiguration(conf);
|
||||
TimelineClientImpl client = createTimelineClient(conf);
|
||||
try {
|
||||
// try getting a delegation token
|
||||
client.getDelegationToken(
|
||||
UserGroupInformation.getCurrentUser().getShortUserName());
|
||||
Assert.fail("Exception expected!"
|
||||
+ "Timeline server should be off to run this test. ");
|
||||
} catch (RuntimeException ce) {
|
||||
Assert.assertTrue(
|
||||
"Handler exception for reason other than retry: " + ce.toString(), ce
|
||||
.getMessage().contains("Connection retries limit exceeded"));
|
||||
// we would expect this exception here, check if the client has retried
|
||||
Assert.assertTrue("Retry filter didn't perform any retries! ",
|
||||
client.connectionRetry.retried);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user