From b4ddb2d3bba9d139960a0053acec9cbe41ae9737 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Mon, 24 Jun 2024 09:34:52 -0700 Subject: [PATCH] HDFS-13603: do not propagate ExecutionException and add maxRetries limit to NameNode edek cache warmup (#6774) --- .../org/apache/hadoop/hdfs/DFSConfigKeys.java | 3 + .../namenode/FSDirEncryptionZoneOp.java | 50 ++++++++-------- .../hdfs/server/namenode/FSNamesystem.java | 9 ++- .../src/main/resources/hdfs-default.xml | 8 +++ .../namenode/TestFSDirEncryptionZoneOp.java | 59 +++++++++++++++++++ 5 files changed, 102 insertions(+), 27 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSDirEncryptionZoneOp.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index 7d136c5c83..b9f8e07f67 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -1422,6 +1422,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final int DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT = 1000; public static final String DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_KEY = "dfs.namenode.edekcacheloader.initial.delay.ms"; public static final int DFS_NAMENODE_EDEKCACHELOADER_INITIAL_DELAY_MS_DEFAULT = 3000; + public static final String DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY = + "dfs.namenode.edekcacheloader.max-retries"; + public static final int DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT = 10; public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_KEY = "dfs.namenode.reencrypt.sleep.interval"; public static final String DFS_NAMENODE_REENCRYPT_SLEEP_INTERVAL_DEFAULT = "1m"; public static final String DFS_NAMENODE_REENCRYPT_BATCH_SIZE_KEY = "dfs.namenode.reencrypt.batch.size"; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirEncryptionZoneOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirEncryptionZoneOp.java index 2110a408b0..34c4216bdc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirEncryptionZoneOp.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirEncryptionZoneOp.java @@ -533,16 +533,16 @@ static boolean isInAnEZ(final FSDirectory fsd, final INodesInPath iip) } /** - * Proactively warm up the edek cache. We'll get all the edek key names, - * then launch up a separate thread to warm them up. + * Best-effort attempt to proactively warm up the edek cache. We'll get all the edek key names, + * then launch up a separate thread to warm them up. Retries happen if any of keys fail to warm up. */ static void warmUpEdekCache(final ExecutorService executor, - final FSDirectory fsd, final int delay, final int interval) { + final FSDirectory fsd, final int delay, final int interval, final int maxRetries) { fsd.readLock(); try { String[] edeks = fsd.ezManager.getKeyNames(); executor.execute( - new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval)); + new EDEKCacheLoader(edeks, fsd.getProvider(), delay, interval, maxRetries)); } finally { fsd.readUnlock(); } @@ -557,19 +557,22 @@ static class EDEKCacheLoader implements Runnable { private final KeyProviderCryptoExtension kp; private int initialDelay; private int retryInterval; + private int maxRetries; EDEKCacheLoader(final String[] names, final KeyProviderCryptoExtension kp, - final int delay, final int interval) { + final int delay, final int interval, final int maxRetries) { this.keyNames = names; this.kp = kp; this.initialDelay = delay; this.retryInterval = interval; + this.maxRetries = maxRetries; } @Override public void run() { NameNode.LOG.info("Warming up {} EDEKs... (initialDelay={}, " - + "retryInterval={})", keyNames.length, initialDelay, retryInterval); + + "retryInterval={}, maxRetries={})", keyNames.length, initialDelay, retryInterval, + maxRetries); try { Thread.sleep(initialDelay); } catch (InterruptedException ie) { @@ -577,42 +580,39 @@ public void run() { return; } - final int logCoolDown = 10000; // periodically print error log (if any) - int sinceLastLog = logCoolDown; // always print the first failure boolean success = false; + int retryCount = 0; IOException lastSeenIOE = null; long warmUpEDEKStartTime = monotonicNow(); - while (true) { + + while (!success && retryCount < maxRetries) { try { kp.warmUpEncryptedKeys(keyNames); - NameNode.LOG - .info("Successfully warmed up {} EDEKs.", keyNames.length); + NameNode.LOG.info("Successfully warmed up {} EDEKs.", keyNames.length); success = true; - break; } catch (IOException ioe) { lastSeenIOE = ioe; - if (sinceLastLog >= logCoolDown) { - NameNode.LOG.info("Failed to warm up EDEKs.", ioe); - sinceLastLog = 0; - } else { - NameNode.LOG.debug("Failed to warm up EDEKs.", ioe); - } + NameNode.LOG.info("Failed to warm up EDEKs.", ioe); } catch (Exception e) { NameNode.LOG.error("Cannot warm up EDEKs.", e); throw e; } - try { - Thread.sleep(retryInterval); - } catch (InterruptedException ie) { - NameNode.LOG.info("EDEKCacheLoader interrupted during retry."); - break; + + if (!success) { + try { + Thread.sleep(retryInterval); + } catch (InterruptedException ie) { + NameNode.LOG.info("EDEKCacheLoader interrupted during retry."); + break; + } + retryCount++; } - sinceLastLog += retryInterval; } + long warmUpEDEKTime = monotonicNow() - warmUpEDEKStartTime; NameNode.getNameNodeMetrics().addWarmUpEDEKTime(warmUpEDEKTime); if (!success) { - NameNode.LOG.warn("Unable to warm up EDEKs."); + NameNode.LOG.warn("Max retry {} reached, unable to warm up EDEKs.", maxRetries); if (lastSeenIOE != null) { NameNode.LOG.warn("Last seen exception:", lastSeenIOE); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 6e5117f4db..edc790dbc3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -579,6 +579,7 @@ private boolean isFromProxyUser(CallerContext ctx) { private ExecutorService edekCacheLoader = null; private final int edekCacheLoaderDelay; private final int edekCacheLoaderInterval; + private final int edekCacheLoaderMaxRetries; /** * When an active namenode will roll its own edit log, in # edits @@ -1012,6 +1013,9 @@ static FSNamesystem loadFromDisk(Configuration conf) throws IOException { this.edekCacheLoaderInterval = conf.getInt( DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY, DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT); + this.edekCacheLoaderMaxRetries = conf.getInt( + DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_KEY, + DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_MAX_RETRIES_DEFAULT); this.leaseRecheckIntervalMs = conf.getLong( DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY, @@ -1470,8 +1474,9 @@ void startActiveServices() throws IOException { new ThreadFactoryBuilder().setDaemon(true) .setNameFormat("Warm Up EDEK Cache Thread #%d") .build()); - FSDirEncryptionZoneOp.warmUpEdekCache(edekCacheLoader, dir, - edekCacheLoaderDelay, edekCacheLoaderInterval); + FSDirEncryptionZoneOp + .warmUpEdekCache(edekCacheLoader, dir, edekCacheLoaderDelay, edekCacheLoaderInterval, + edekCacheLoaderMaxRetries); } if (blockManager.getSPSManager() != null) { blockManager.getSPSManager().start(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index 3e362de198..94c3ea0cc9 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -3614,6 +3614,14 @@ + + dfs.namenode.edekcacheloader.max-retries + 10 + When KeyProvider is configured, the max retries allowed to attempt + warm up edek cache if none of key successful on NN start up / become active. + + + dfs.namenode.reencrypt.sleep.interval 1m diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSDirEncryptionZoneOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSDirEncryptionZoneOp.java new file mode 100644 index 0000000000..2fa6a33f0b --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSDirEncryptionZoneOp.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; + +import org.junit.Test; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +public class TestFSDirEncryptionZoneOp { + + @Test + public void testWarmUpEdekCacheRetries() throws IOException { + NameNode.initMetrics(new Configuration(), NamenodeRole.NAMENODE); + + final int initialDelay = 100; + final int retryInterval = 100; + final int maxRetries = 2; + + KeyProviderCryptoExtension kpMock = mock(KeyProviderCryptoExtension.class); + + doThrow(new IOException()) + .doThrow(new IOException()) + .doAnswer(invocation -> null) + .when(kpMock).warmUpEncryptedKeys(any()); + + FSDirEncryptionZoneOp.EDEKCacheLoader loader = + new FSDirEncryptionZoneOp.EDEKCacheLoader(new String[] {"edek1", "edek2"}, kpMock, + initialDelay, retryInterval, maxRetries); + + loader.run(); + + verify(kpMock, times(maxRetries)).warmUpEncryptedKeys(any()); + } +} \ No newline at end of file