HADOOP-13590. Retry until TGT expires even if the UGI renewal thread encountered exception.
This commit is contained in:
parent
280357c29f
commit
367c3d4121
@ -43,6 +43,7 @@
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import javax.security.auth.Subject;
|
||||
import javax.security.auth.callback.CallbackHandler;
|
||||
@ -54,14 +55,18 @@
|
||||
import javax.security.auth.login.LoginException;
|
||||
import javax.security.auth.spi.LoginModule;
|
||||
|
||||
import org.apache.hadoop.io.retry.RetryPolicies;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.retry.RetryPolicy;
|
||||
import org.apache.hadoop.metrics2.annotation.Metric;
|
||||
import org.apache.hadoop.metrics2.annotation.Metrics;
|
||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
|
||||
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
|
||||
import org.apache.hadoop.metrics2.lib.MutableRate;
|
||||
import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
|
||||
@ -85,7 +90,8 @@
|
||||
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"})
|
||||
@InterfaceStability.Evolving
|
||||
public class UserGroupInformation {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(
|
||||
@VisibleForTesting
|
||||
static final Logger LOG = LoggerFactory.getLogger(
|
||||
UserGroupInformation.class);
|
||||
|
||||
/**
|
||||
@ -121,6 +127,10 @@ static class UgiMetrics {
|
||||
MutableRate loginFailure;
|
||||
@Metric("GetGroups") MutableRate getGroups;
|
||||
MutableQuantiles[] getGroupsQuantiles;
|
||||
@Metric("Renewal failures since startup")
|
||||
private MutableGaugeLong renewalFailuresTotal;
|
||||
@Metric("Renewal failures since last successful login")
|
||||
private MutableGaugeInt renewalFailures;
|
||||
|
||||
static UgiMetrics create() {
|
||||
return DefaultMetricsSystem.instance().register(new UgiMetrics());
|
||||
@ -138,6 +148,10 @@ void addGetGroups(long latency) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MutableGaugeInt getRenewalFailures() {
|
||||
return renewalFailures;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -963,6 +977,7 @@ public void run() {
|
||||
return;
|
||||
}
|
||||
long nextRefresh = getRefreshTime(tgt);
|
||||
RetryPolicy rp = null;
|
||||
while (true) {
|
||||
try {
|
||||
long now = Time.now();
|
||||
@ -986,13 +1001,40 @@ public void run() {
|
||||
}
|
||||
nextRefresh = Math.max(getRefreshTime(tgt),
|
||||
now + kerberosMinSecondsBeforeRelogin);
|
||||
metrics.renewalFailures.set(0);
|
||||
rp = null;
|
||||
} catch (InterruptedException ie) {
|
||||
LOG.warn("Terminating renewal thread");
|
||||
return;
|
||||
} catch (IOException ie) {
|
||||
LOG.warn("Exception encountered while running the" +
|
||||
" renewal command. Aborting renew thread. " + ie);
|
||||
return;
|
||||
metrics.renewalFailuresTotal.incr();
|
||||
final long tgtEndTime = tgt.getEndTime().getTime();
|
||||
LOG.warn("Exception encountered while running the renewal "
|
||||
+ "command for {}. (TGT end time:{}, renewalFailures: {},"
|
||||
+ "renewalFailuresTotal: {})", getUserName(), tgtEndTime,
|
||||
metrics.renewalFailures, metrics.renewalFailuresTotal, ie);
|
||||
final long now = Time.now();
|
||||
if (rp == null) {
|
||||
// Use a dummy maxRetries to create the policy. The policy will
|
||||
// only be used to get next retry time with exponential back-off.
|
||||
// The final retry time will be later limited within the
|
||||
// tgt endTime in getNextTgtRenewalTime.
|
||||
rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
|
||||
kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
try {
|
||||
nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Exception when calculating next tgt renewal time", e);
|
||||
return;
|
||||
}
|
||||
metrics.renewalFailures.incr();
|
||||
// retry until close enough to tgt endTime.
|
||||
if (now > nextRefresh) {
|
||||
LOG.error("TGT is expired. Aborting renew thread for {}.",
|
||||
getUserName());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1001,6 +1043,26 @@ public void run() {
|
||||
t.setName("TGT Renewer for " + getUserName());
|
||||
t.start();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get time for next login retry. This will allow the thread to retry with
|
||||
* exponential back-off, until tgt endtime.
|
||||
* Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime.
|
||||
*
|
||||
* @param tgtEndTime EndTime of the tgt.
|
||||
* @param now Current time.
|
||||
* @param rp The retry policy.
|
||||
* @return Time for next login retry.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
static long getNextTgtRenewalTime(final long tgtEndTime, final long now,
|
||||
final RetryPolicy rp) throws Exception {
|
||||
final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin;
|
||||
final RetryPolicy.RetryAction ra = rp.shouldRetry(null,
|
||||
metrics.renewalFailures.value(), 0, false);
|
||||
return Math.min(lastRetryTime, now + ra.delayMillis);
|
||||
}
|
||||
|
||||
/**
|
||||
* Log a user in from a keytab file. Loads a user identity from a keytab
|
||||
* file and logs them in. They become the currently logged-in user.
|
||||
|
@ -0,0 +1,144 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.security;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.minikdc.MiniKdc;
|
||||
import org.apache.hadoop.security.authentication.util.KerberosUtil;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.test.LambdaTestUtils;
|
||||
import org.apache.hadoop.util.PlatformName;
|
||||
import org.apache.log4j.Level;
|
||||
import org.junit.After;
|
||||
import org.junit.Test;
|
||||
|
||||
import javax.security.auth.Subject;
|
||||
import javax.security.auth.kerberos.KerberosPrincipal;
|
||||
import javax.security.auth.login.AppConfigurationEntry;
|
||||
import javax.security.auth.login.LoginContext;
|
||||
import java.io.File;
|
||||
import java.security.Principal;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
|
||||
|
||||
/**
|
||||
* Test {@link UserGroupInformation} with a minikdc.
|
||||
*/
|
||||
public class TestUGIWithMiniKdc {
|
||||
|
||||
private static MiniKdc kdc;
|
||||
|
||||
@After
|
||||
public void teardown() {
|
||||
UserGroupInformation.reset();
|
||||
if (kdc != null) {
|
||||
kdc.stop();
|
||||
}
|
||||
}
|
||||
|
||||
private void setupKdc() throws Exception {
|
||||
Properties kdcConf = MiniKdc.createConf();
|
||||
// tgt expire time = 30 seconds
|
||||
kdcConf.setProperty(MiniKdc.MAX_TICKET_LIFETIME, "30");
|
||||
kdcConf.setProperty(MiniKdc.MIN_TICKET_LIFETIME, "30");
|
||||
File kdcDir = new File(System.getProperty("test.dir", "target"));
|
||||
kdc = new MiniKdc(kdcConf, kdcDir);
|
||||
kdc.start();
|
||||
}
|
||||
|
||||
@Test(timeout = 120000)
|
||||
public void testAutoRenewalThreadRetryWithKdc() throws Exception {
|
||||
GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
|
||||
final Configuration conf = new Configuration();
|
||||
// Relogin every 1 second
|
||||
conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, 1);
|
||||
SecurityUtil.setAuthenticationMethod(
|
||||
UserGroupInformation.AuthenticationMethod.KERBEROS, conf);
|
||||
UserGroupInformation.setConfiguration(conf);
|
||||
|
||||
LoginContext loginContext = null;
|
||||
try {
|
||||
final String principal = "foo";
|
||||
final File workDir = new File(System.getProperty("test.dir", "target"));
|
||||
final File keytab = new File(workDir, "foo.keytab");
|
||||
final Set<Principal> principals = new HashSet<>();
|
||||
principals.add(new KerberosPrincipal(principal));
|
||||
setupKdc();
|
||||
kdc.createPrincipal(keytab, principal);
|
||||
|
||||
// client login
|
||||
final Subject subject =
|
||||
new Subject(false, principals, new HashSet<>(), new HashSet<>());
|
||||
|
||||
loginContext = new LoginContext("", subject, null,
|
||||
new javax.security.auth.login.Configuration() {
|
||||
@Override
|
||||
public AppConfigurationEntry[] getAppConfigurationEntry(
|
||||
String name) {
|
||||
Map<String, String> options = new HashMap<>();
|
||||
options.put("principal", principal);
|
||||
options.put("refreshKrb5Config", "true");
|
||||
if (PlatformName.IBM_JAVA) {
|
||||
options.put("useKeytab", keytab.getPath());
|
||||
options.put("credsType", "both");
|
||||
} else {
|
||||
options.put("keyTab", keytab.getPath());
|
||||
options.put("useKeyTab", "true");
|
||||
options.put("storeKey", "true");
|
||||
options.put("doNotPrompt", "true");
|
||||
options.put("useTicketCache", "true");
|
||||
options.put("renewTGT", "true");
|
||||
options.put("isInitiator", Boolean.toString(true));
|
||||
}
|
||||
String ticketCache = System.getenv("KRB5CCNAME");
|
||||
if (ticketCache != null) {
|
||||
options.put("ticketCache", ticketCache);
|
||||
}
|
||||
options.put("debug", "true");
|
||||
return new AppConfigurationEntry[] {new AppConfigurationEntry(
|
||||
KerberosUtil.getKrb5LoginModuleName(),
|
||||
AppConfigurationEntry.LoginModuleControlFlag.REQUIRED,
|
||||
options)};
|
||||
}
|
||||
});
|
||||
loginContext.login();
|
||||
final Subject loginSubject = loginContext.getSubject();
|
||||
UserGroupInformation.loginUserFromSubject(loginSubject);
|
||||
|
||||
// Verify retry happens. Do not verify retry count to reduce flakiness.
|
||||
// Detailed back-off logic is tested separately in
|
||||
// TestUserGroupInformation#testGetNextRetryTime
|
||||
LambdaTestUtils.await(30000, 500,
|
||||
() -> {
|
||||
final int count =
|
||||
UserGroupInformation.metrics.getRenewalFailures().value();
|
||||
UserGroupInformation.LOG.info("Renew failure count is {}", count);
|
||||
return count > 0;
|
||||
});
|
||||
} finally {
|
||||
if (loginContext != null) {
|
||||
loginContext.logout();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -20,6 +20,8 @@
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.retry.RetryPolicies;
|
||||
import org.apache.hadoop.io.retry.RetryPolicy;
|
||||
import org.apache.hadoop.ipc.TestRpcBase.TestTokenIdentifier;
|
||||
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||
import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
|
||||
@ -27,13 +29,18 @@
|
||||
import org.apache.hadoop.security.authentication.util.KerberosName;
|
||||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.security.token.TokenIdentifier;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.util.Time;
|
||||
import org.apache.log4j.Level;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.security.auth.Subject;
|
||||
import javax.security.auth.kerberos.KerberosPrincipal;
|
||||
@ -49,10 +56,13 @@
|
||||
import java.security.PrivilegedExceptionAction;
|
||||
import java.util.Collection;
|
||||
import java.util.ConcurrentModificationException;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS;
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL;
|
||||
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
|
||||
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
|
||||
@ -74,6 +84,9 @@
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestUserGroupInformation {
|
||||
|
||||
static final Logger LOG = LoggerFactory.getLogger(
|
||||
TestUserGroupInformation.class);
|
||||
final private static String USER_NAME = "user1@HADOOP.APACHE.ORG";
|
||||
final private static String GROUP1_NAME = "group1";
|
||||
final private static String GROUP2_NAME = "group2";
|
||||
@ -1027,6 +1040,86 @@ public Void run() throws IOException {
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetNextRetryTime() throws Exception {
|
||||
GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
|
||||
final long reloginInterval = 1;
|
||||
final long reloginIntervalMs = reloginInterval * 1000;
|
||||
// Relogin happens every 1 second.
|
||||
conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval);
|
||||
SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf);
|
||||
UserGroupInformation.setConfiguration(conf);
|
||||
|
||||
// Suppose tgt start time is now, end time is 20 seconds from now.
|
||||
final long now = Time.now();
|
||||
final Date endDate = new Date(now + 20000);
|
||||
|
||||
// Explicitly test the exponential back-off logic.
|
||||
// Suppose some time (10 seconds) passed.
|
||||
// Verify exponential backoff and max=(login interval before endTime).
|
||||
final long currentTime = now + 10000;
|
||||
final long endTime = endDate.getTime();
|
||||
|
||||
assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value());
|
||||
RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
|
||||
1000, TimeUnit.MILLISECONDS);
|
||||
long lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
assertWithinBounds(
|
||||
UserGroupInformation.metrics.getRenewalFailures().value(),
|
||||
lastRetry, reloginIntervalMs, currentTime);
|
||||
|
||||
UserGroupInformation.metrics.getRenewalFailures().incr();
|
||||
lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
assertWithinBounds(
|
||||
UserGroupInformation.metrics.getRenewalFailures().value(),
|
||||
lastRetry, reloginIntervalMs, currentTime);
|
||||
|
||||
UserGroupInformation.metrics.getRenewalFailures().incr();
|
||||
lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
assertWithinBounds(
|
||||
UserGroupInformation.metrics.getRenewalFailures().value(),
|
||||
lastRetry, reloginIntervalMs, currentTime);
|
||||
|
||||
UserGroupInformation.metrics.getRenewalFailures().incr();
|
||||
lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
assertWithinBounds(
|
||||
UserGroupInformation.metrics.getRenewalFailures().value(),
|
||||
lastRetry, reloginIntervalMs, currentTime);
|
||||
|
||||
// last try should be right before expiry.
|
||||
UserGroupInformation.metrics.getRenewalFailures().incr();
|
||||
lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
String str =
|
||||
"5th retry, now:" + currentTime + ", retry:" + lastRetry;
|
||||
LOG.info(str);
|
||||
assertEquals(str, endTime - reloginIntervalMs, lastRetry);
|
||||
|
||||
// make sure no more retries after (tgt endTime - login interval).
|
||||
UserGroupInformation.metrics.getRenewalFailures().incr();
|
||||
lastRetry =
|
||||
UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
|
||||
str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry;
|
||||
LOG.info(str);
|
||||
assertEquals(str, endTime - reloginIntervalMs, lastRetry);
|
||||
}
|
||||
|
||||
private void assertWithinBounds(final int numFailures, final long lastRetry,
|
||||
final long reloginIntervalMs, long now) {
|
||||
// shift is 2 to the power of (numFailure).
|
||||
int shift = numFailures + 1;
|
||||
final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5);
|
||||
final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5);
|
||||
final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now
|
||||
+ ", lower bound:" + lower + ", upper bound:" + upper
|
||||
+ ", retry:" + lastRetry);
|
||||
LOG.info(str);
|
||||
assertTrue(str, lower <= lastRetry && lastRetry < upper);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user