From 367c3d41217728c2e61252c5a5235e5bc1f9822f Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Wed, 9 Nov 2016 09:07:12 -0800 Subject: [PATCH] HADOOP-13590. Retry until TGT expires even if the UGI renewal thread encountered exception. --- .../hadoop/security/UserGroupInformation.java | 70 ++++++++- .../hadoop/security/TestUGIWithMiniKdc.java | 144 ++++++++++++++++++ .../security/TestUserGroupInformation.java | 93 +++++++++++ 3 files changed, 303 insertions(+), 4 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index 111c3f8ce6..82603a40d9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -43,6 +43,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import javax.security.auth.Subject; import javax.security.auth.callback.CallbackHandler; @@ -54,14 +55,18 @@ import javax.security.auth.login.LoginException; import javax.security.auth.spi.LoginModule; +import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; @@ -85,7 +90,8 @@ @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", "Oozie"}) @InterfaceStability.Evolving public class UserGroupInformation { - private static final Logger LOG = LoggerFactory.getLogger( + @VisibleForTesting + static final Logger LOG = LoggerFactory.getLogger( UserGroupInformation.class); /** @@ -121,6 +127,10 @@ static class UgiMetrics { MutableRate loginFailure; @Metric("GetGroups") MutableRate getGroups; MutableQuantiles[] getGroupsQuantiles; + @Metric("Renewal failures since startup") + private MutableGaugeLong renewalFailuresTotal; + @Metric("Renewal failures since last successful login") + private MutableGaugeInt renewalFailures; static UgiMetrics create() { return DefaultMetricsSystem.instance().register(new UgiMetrics()); @@ -138,6 +148,10 @@ void addGetGroups(long latency) { } } } + + MutableGaugeInt getRenewalFailures() { + return renewalFailures; + } } /** @@ -963,6 +977,7 @@ public void run() { return; } long nextRefresh = getRefreshTime(tgt); + RetryPolicy rp = null; while (true) { try { long now = Time.now(); @@ -986,13 +1001,40 @@ public void run() { } nextRefresh = Math.max(getRefreshTime(tgt), now + kerberosMinSecondsBeforeRelogin); + metrics.renewalFailures.set(0); + rp = null; } catch (InterruptedException ie) { LOG.warn("Terminating renewal thread"); return; } catch (IOException ie) { - LOG.warn("Exception encountered while running the" + - " renewal command. Aborting renew thread. " + ie); - return; + metrics.renewalFailuresTotal.incr(); + final long tgtEndTime = tgt.getEndTime().getTime(); + LOG.warn("Exception encountered while running the renewal " + + "command for {}. (TGT end time:{}, renewalFailures: {}," + + "renewalFailuresTotal: {})", getUserName(), tgtEndTime, + metrics.renewalFailures, metrics.renewalFailuresTotal, ie); + final long now = Time.now(); + if (rp == null) { + // Use a dummy maxRetries to create the policy. The policy will + // only be used to get next retry time with exponential back-off. + // The final retry time will be later limited within the + // tgt endTime in getNextTgtRenewalTime. + rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS); + } + try { + nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp); + } catch (Exception e) { + LOG.error("Exception when calculating next tgt renewal time", e); + return; + } + metrics.renewalFailures.incr(); + // retry until close enough to tgt endTime. + if (now > nextRefresh) { + LOG.error("TGT is expired. Aborting renew thread for {}.", + getUserName()); + return; + } } } } @@ -1001,6 +1043,26 @@ public void run() { t.setName("TGT Renewer for " + getUserName()); t.start(); } + + /** + * Get time for next login retry. This will allow the thread to retry with + * exponential back-off, until tgt endtime. + * Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime. + * + * @param tgtEndTime EndTime of the tgt. + * @param now Current time. + * @param rp The retry policy. + * @return Time for next login retry. + */ + @VisibleForTesting + static long getNextTgtRenewalTime(final long tgtEndTime, final long now, + final RetryPolicy rp) throws Exception { + final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin; + final RetryPolicy.RetryAction ra = rp.shouldRetry(null, + metrics.renewalFailures.value(), 0, false); + return Math.min(lastRetryTime, now + ra.delayMillis); + } + /** * Log a user in from a keytab file. Loads a user identity from a keytab * file and logs them in. They become the currently logged-in user. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java new file mode 100644 index 0000000000..0016a65e22 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java @@ -0,0 +1,144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.security; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.minikdc.MiniKdc; +import org.apache.hadoop.security.authentication.util.KerberosUtil; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.LambdaTestUtils; +import org.apache.hadoop.util.PlatformName; +import org.apache.log4j.Level; +import org.junit.After; +import org.junit.Test; + +import javax.security.auth.Subject; +import javax.security.auth.kerberos.KerberosPrincipal; +import javax.security.auth.login.AppConfigurationEntry; +import javax.security.auth.login.LoginContext; +import java.io.File; +import java.security.Principal; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN; + +/** + * Test {@link UserGroupInformation} with a minikdc. + */ +public class TestUGIWithMiniKdc { + + private static MiniKdc kdc; + + @After + public void teardown() { + UserGroupInformation.reset(); + if (kdc != null) { + kdc.stop(); + } + } + + private void setupKdc() throws Exception { + Properties kdcConf = MiniKdc.createConf(); + // tgt expire time = 30 seconds + kdcConf.setProperty(MiniKdc.MAX_TICKET_LIFETIME, "30"); + kdcConf.setProperty(MiniKdc.MIN_TICKET_LIFETIME, "30"); + File kdcDir = new File(System.getProperty("test.dir", "target")); + kdc = new MiniKdc(kdcConf, kdcDir); + kdc.start(); + } + + @Test(timeout = 120000) + public void testAutoRenewalThreadRetryWithKdc() throws Exception { + GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG); + final Configuration conf = new Configuration(); + // Relogin every 1 second + conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, 1); + SecurityUtil.setAuthenticationMethod( + UserGroupInformation.AuthenticationMethod.KERBEROS, conf); + UserGroupInformation.setConfiguration(conf); + + LoginContext loginContext = null; + try { + final String principal = "foo"; + final File workDir = new File(System.getProperty("test.dir", "target")); + final File keytab = new File(workDir, "foo.keytab"); + final Set principals = new HashSet<>(); + principals.add(new KerberosPrincipal(principal)); + setupKdc(); + kdc.createPrincipal(keytab, principal); + + // client login + final Subject subject = + new Subject(false, principals, new HashSet<>(), new HashSet<>()); + + loginContext = new LoginContext("", subject, null, + new javax.security.auth.login.Configuration() { + @Override + public AppConfigurationEntry[] getAppConfigurationEntry( + String name) { + Map options = new HashMap<>(); + options.put("principal", principal); + options.put("refreshKrb5Config", "true"); + if (PlatformName.IBM_JAVA) { + options.put("useKeytab", keytab.getPath()); + options.put("credsType", "both"); + } else { + options.put("keyTab", keytab.getPath()); + options.put("useKeyTab", "true"); + options.put("storeKey", "true"); + options.put("doNotPrompt", "true"); + options.put("useTicketCache", "true"); + options.put("renewTGT", "true"); + options.put("isInitiator", Boolean.toString(true)); + } + String ticketCache = System.getenv("KRB5CCNAME"); + if (ticketCache != null) { + options.put("ticketCache", ticketCache); + } + options.put("debug", "true"); + return new AppConfigurationEntry[] {new AppConfigurationEntry( + KerberosUtil.getKrb5LoginModuleName(), + AppConfigurationEntry.LoginModuleControlFlag.REQUIRED, + options)}; + } + }); + loginContext.login(); + final Subject loginSubject = loginContext.getSubject(); + UserGroupInformation.loginUserFromSubject(loginSubject); + + // Verify retry happens. Do not verify retry count to reduce flakiness. + // Detailed back-off logic is tested separately in + // TestUserGroupInformation#testGetNextRetryTime + LambdaTestUtils.await(30000, 500, + () -> { + final int count = + UserGroupInformation.metrics.getRenewalFailures().value(); + UserGroupInformation.LOG.info("Renew failure count is {}", count); + return count > 0; + }); + } finally { + if (loginContext != null) { + loginContext.logout(); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java index a52cd46f82..ed0f58c415 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java @@ -20,6 +20,8 @@ import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.retry.RetryPolicies; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.ipc.TestRpcBase.TestTokenIdentifier; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; @@ -27,13 +29,18 @@ import org.apache.hadoop.security.authentication.util.KerberosName; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Time; +import org.apache.log4j.Level; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.security.auth.Subject; import javax.security.auth.kerberos.KerberosPrincipal; @@ -49,10 +56,13 @@ import java.security.PrivilegedExceptionAction; import java.util.Collection; import java.util.ConcurrentModificationException; +import java.util.Date; import java.util.LinkedHashSet; import java.util.Set; +import java.util.concurrent.TimeUnit; import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt; @@ -74,6 +84,9 @@ import static org.mockito.Mockito.when; public class TestUserGroupInformation { + + static final Logger LOG = LoggerFactory.getLogger( + TestUserGroupInformation.class); final private static String USER_NAME = "user1@HADOOP.APACHE.ORG"; final private static String GROUP1_NAME = "group1"; final private static String GROUP2_NAME = "group2"; @@ -1027,6 +1040,86 @@ public Void run() throws IOException { return null; } }); + } + @Test + public void testGetNextRetryTime() throws Exception { + GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG); + final long reloginInterval = 1; + final long reloginIntervalMs = reloginInterval * 1000; + // Relogin happens every 1 second. + conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval); + SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf); + UserGroupInformation.setConfiguration(conf); + + // Suppose tgt start time is now, end time is 20 seconds from now. + final long now = Time.now(); + final Date endDate = new Date(now + 20000); + + // Explicitly test the exponential back-off logic. + // Suppose some time (10 seconds) passed. + // Verify exponential backoff and max=(login interval before endTime). + final long currentTime = now + 10000; + final long endTime = endDate.getTime(); + + assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value()); + RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2, + 1000, TimeUnit.MILLISECONDS); + long lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + assertWithinBounds( + UserGroupInformation.metrics.getRenewalFailures().value(), + lastRetry, reloginIntervalMs, currentTime); + + // last try should be right before expiry. + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + String str = + "5th retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + + // make sure no more retries after (tgt endTime - login interval). + UserGroupInformation.metrics.getRenewalFailures().incr(); + lastRetry = + UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp); + str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry; + LOG.info(str); + assertEquals(str, endTime - reloginIntervalMs, lastRetry); + } + + private void assertWithinBounds(final int numFailures, final long lastRetry, + final long reloginIntervalMs, long now) { + // shift is 2 to the power of (numFailure). + int shift = numFailures + 1; + final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5); + final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5); + final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now + + ", lower bound:" + lower + ", upper bound:" + upper + + ", retry:" + lastRetry); + LOG.info(str); + assertTrue(str, lower <= lastRetry && lastRetry < upper); } }