HDFS-13603: do not propagate ExecutionException and add maxRetries limit to NameNode edek cache warmup (#6774)

This commit is contained in:
Yu Zhang 2024-06-24 09:34:52 -07:00 committed by GitHub
parent d3b98cb1b2
commit b4ddb2d3bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 102 additions and 27 deletions

View File

@ -1422,6 +1422,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT = 1000; public static final int DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT = 1000;
public static final String DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_KEY = "dfs.namenode.edekcacheloader.initial.delay.ms"; public static final String DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_KEY = "dfs.namenode.edekcacheloader.initial.delay.ms";
public static final int DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_DEFAULT = 3000; public static final int DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_DEFAULT = 3000;
public static final String DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY =
"dfs.namenode.edekcacheloader.max-retries";
public static final int DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT = 10;
public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_KEY = "dfs.namenode.reencrypt.sleep.interval"; public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_KEY = "dfs.namenode.reencrypt.sleep.interval";
public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_DEFAULT = "1m"; public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_DEFAULT = "1m";
public static final String DFS_NAMENODE_REENCRYPT_BATCH_SIZE_KEY = "dfs.namenode.reencrypt.batch.size"; public static final String DFS_NAMENODE_REENCRYPT_BATCH_SIZE_KEY = "dfs.namenode.reencrypt.batch.size";

View File

@ -533,16 +533,16 @@ static boolean isInAnEZ(final FSDirectory fsd, final INodesInPath iip)
} }
/** /**
* Proactively warm up the edek cache. We'll get all the edek key names, * Best-effort attempt to proactively warm up the edek cache. We'll get all the edek key names,
* then launch up a separate thread to warm them up. * then launch up a separate thread to warm them up. Retries happen if any of keys fail to warm up.
*/ */
static void warmUpEdekCache(final ExecutorService executor, static void warmUpEdekCache(final ExecutorService executor,
final FSDirectory fsd, final int delay, final int interval) { final FSDirectory fsd, final int delay, final int interval, final int maxRetries) {
fsd.readLock(); fsd.readLock();
try { try {
String[] edeks = fsd.ezManager.getKeyNames(); String[] edeks = fsd.ezManager.getKeyNames();
executor.execute( executor.execute(
new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval)); new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval, maxRetries));
} finally { } finally {
fsd.readUnlock(); fsd.readUnlock();
} }
@ -557,19 +557,22 @@ static class EDEKCacheLoader implements Runnable {
private final KeyProviderCryptoExtension kp; private final KeyProviderCryptoExtension kp;
private int initialDelay; private int initialDelay;
private int retryInterval; private int retryInterval;
private int maxRetries;
EDEKCacheLoader(final String[] names, final KeyProviderCryptoExtension kp, EDEKCacheLoader(final String[] names, final KeyProviderCryptoExtension kp,
final int delay, final int interval) { final int delay, final int interval, final int maxRetries) {
this.keyNames = names; this.keyNames = names;
this.kp = kp; this.kp = kp;
this.initialDelay = delay; this.initialDelay = delay;
this.retryInterval = interval; this.retryInterval = interval;
this.maxRetries = maxRetries;
} }
@Override @Override
public void run() { public void run() {
NameNode.LOG.info("Warming up {} EDEKs... (initialDelay={}, " NameNode.LOG.info("Warming up {} EDEKs... (initialDelay={}, "
+ "retryInterval={})", keyNames.length, initialDelay, retryInterval); + "retryInterval={}, maxRetries={})", keyNames.length, initialDelay, retryInterval,
maxRetries);
try { try {
Thread.sleep(initialDelay); Thread.sleep(initialDelay);
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
@ -577,42 +580,39 @@ public void run() {
return; return;
} }
final int logCoolDown = 10000; // periodically print error log (if any)
int sinceLastLog = logCoolDown; // always print the first failure
boolean success = false; boolean success = false;
int retryCount = 0;
IOException lastSeenIOE = null; IOException lastSeenIOE = null;
long warmUpEDEKStartTime = monotonicNow(); long warmUpEDEKStartTime = monotonicNow();
while (true) {
while (!success && retryCount < maxRetries) {
try { try {
kp.warmUpEncryptedKeys(keyNames); kp.warmUpEncryptedKeys(keyNames);
NameNode.LOG NameNode.LOG.info("Successfully warmed up {} EDEKs.", keyNames.length);
.info("Successfully warmed up {} EDEKs.", keyNames.length);
success = true; success = true;
break;
} catch (IOException ioe) { } catch (IOException ioe) {
lastSeenIOE = ioe; lastSeenIOE = ioe;
if (sinceLastLog >= logCoolDown) {
NameNode.LOG.info("Failed to warm up EDEKs.", ioe); NameNode.LOG.info("Failed to warm up EDEKs.", ioe);
sinceLastLog = 0;
} else {
NameNode.LOG.debug("Failed to warm up EDEKs.", ioe);
}
} catch (Exception e) { } catch (Exception e) {
NameNode.LOG.error("Cannot warm up EDEKs.", e); NameNode.LOG.error("Cannot warm up EDEKs.", e);
throw e; throw e;
} }
if (!success) {
try { try {
Thread.sleep(retryInterval); Thread.sleep(retryInterval);
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
NameNode.LOG.info("EDEKCacheLoader interrupted during retry."); NameNode.LOG.info("EDEKCacheLoader interrupted during retry.");
break; break;
} }
sinceLastLog += retryInterval; retryCount++;
} }
}
long warmUpEDEKTime = monotonicNow() - warmUpEDEKStartTime; long warmUpEDEKTime = monotonicNow() - warmUpEDEKStartTime;
NameNode.getNameNodeMetrics().addWarmUpEDEKTime(warmUpEDEKTime); NameNode.getNameNodeMetrics().addWarmUpEDEKTime(warmUpEDEKTime);
if (!success) { if (!success) {
NameNode.LOG.warn("Unable to warm up EDEKs."); NameNode.LOG.warn("Max retry {} reached, unable to warm up EDEKs.", maxRetries);
if (lastSeenIOE != null) { if (lastSeenIOE != null) {
NameNode.LOG.warn("Last seen exception:", lastSeenIOE); NameNode.LOG.warn("Last seen exception:", lastSeenIOE);
} }

View File

@ -579,6 +579,7 @@ private boolean isFromProxyUser(CallerContext ctx) {
private ExecutorService edekCacheLoader = null; private ExecutorService edekCacheLoader = null;
private final int edekCacheLoaderDelay; private final int edekCacheLoaderDelay;
private final int edekCacheLoaderInterval; private final int edekCacheLoaderInterval;
private final int edekCacheLoaderMaxRetries;
/** /**
* When an active namenode will roll its own edit log, in # edits * When an active namenode will roll its own edit log, in # edits
@ -1012,6 +1013,9 @@ static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
this.edekCacheLoaderInterval = conf.getInt( this.edekCacheLoaderInterval = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY, DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY,
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT); DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT);
this.edekCacheLoaderMaxRetries = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY,
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT);
this.leaseRecheckIntervalMs = conf.getLong( this.leaseRecheckIntervalMs = conf.getLong(
DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY, DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY,
@ -1470,8 +1474,9 @@ void startActiveServices() throws IOException {
new ThreadFactoryBuilder().setDaemon(true) new ThreadFactoryBuilder().setDaemon(true)
.setNameFormat("Warm Up EDEK Cache Thread #%d") .setNameFormat("Warm Up EDEK Cache Thread #%d")
.build()); .build());
FSDirEncryptionZoneOp.warmUpEdekCache(edekCacheLoader, dir, FSDirEncryptionZoneOp
edekCacheLoaderDelay, edekCacheLoaderInterval); .warmUpEdekCache(edekCacheLoader, dir, edekCacheLoaderDelay, edekCacheLoaderInterval,
edekCacheLoaderMaxRetries);
} }
if (blockManager.getSPSManager() != null) { if (blockManager.getSPSManager() != null) {
blockManager.getSPSManager().start(); blockManager.getSPSManager().start();

View File

@ -3614,6 +3614,14 @@
</description> </description>
</property> </property>
<property>
<name>dfs.namenode.edekcacheloader.max-retries</name>
<value>10</value>
<description>When KeyProvider is configured, the max retries allowed to attempt
warm up edek cache if none of key successful on NN start up / become active.
</description>
</property>
<property> <property>
<name>dfs.namenode.reencrypt.sleep.interval</name> <name>dfs.namenode.reencrypt.sleep.interval</name>
<value>1m</value> <value>1m</value>

View File

@ -0,0 +1,59 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.junit.Test;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
public class TestFSDirEncryptionZoneOp {
@Test
public void testWarmUpEdekCacheRetries() throws IOException {
NameNode.initMetrics(new Configuration(), NamenodeRole.NAMENODE);
final int initialDelay = 100;
final int retryInterval = 100;
final int maxRetries = 2;
KeyProviderCryptoExtension kpMock = mock(KeyProviderCryptoExtension.class);
doThrow(new IOException())
.doThrow(new IOException())
.doAnswer(invocation -> null)
.when(kpMock).warmUpEncryptedKeys(any());
FSDirEncryptionZoneOp.EDEKCacheLoader loader =
new FSDirEncryptionZoneOp.EDEKCacheLoader(new String[] {"edek1", "edek2"}, kpMock,
initialDelay, retryInterval, maxRetries);
loader.run();
verify(kpMock, times(maxRetries)).warmUpEncryptedKeys(any());
}
}