From b4fed58c6ab4f256542c4df3c468eb29d3ada015 Mon Sep 17 00:00:00 2001 From: LiuGuH <444506464@qq.com> Date: Thu, 21 Dec 2023 12:21:59 +0800 Subject: [PATCH] HDFS-17285. RBF: Add a safe mode check period configuration (#6347) Contributed by LiuGuH. Reviewed-by: Inigo Goiri Reviewed-by: Ayush Saxena Signed-off-by: Shilun Fan --- .../hdfs/server/federation/router/RBFConfigKeys.java | 4 ++++ .../federation/router/RouterSafemodeService.java | 4 ++-- .../src/main/resources/hdfs-rbf-default.xml | 11 +++++++++++ .../server/federation/router/TestRouterSafemode.java | 6 +++++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RBFConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RBFConfigKeys.java index 3230af8e07..7000a72b3a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RBFConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RBFConfigKeys.java @@ -279,6 +279,10 @@ public class RBFConfigKeys extends CommonConfigurationKeysPublic { FEDERATION_ROUTER_PREFIX + "safemode.expiration"; public static final long DFS_ROUTER_SAFEMODE_EXPIRATION_DEFAULT = 3 * DFS_ROUTER_CACHE_TIME_TO_LIVE_MS_DEFAULT; + public static final String DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS = + FEDERATION_ROUTER_PREFIX + "safemode.checkperiod"; + public static final long DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS_DEFAULT = + TimeUnit.SECONDS.toMillis(5); // HDFS Router-based federation mount table entries /** Maximum number of cache entries to have. 
*/ diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterSafemodeService.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterSafemodeService.java index ce9e1a64c5..35ed854065 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterSafemodeService.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterSafemodeService.java @@ -133,8 +133,8 @@ protected void serviceInit(Configuration conf) throws Exception { // Use same interval as cache update service this.setIntervalMs(conf.getTimeDuration( - RBFConfigKeys.DFS_ROUTER_CACHE_TIME_TO_LIVE_MS, - RBFConfigKeys.DFS_ROUTER_CACHE_TIME_TO_LIVE_MS_DEFAULT, + RBFConfigKeys.DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS, + RBFConfigKeys.DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS_DEFAULT, TimeUnit.MILLISECONDS)); this.startupInterval = conf.getTimeDuration( diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/resources/hdfs-rbf-default.xml b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/resources/hdfs-rbf-default.xml index 8322a72aba..43bd17d75f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/resources/hdfs-rbf-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/resources/hdfs-rbf-default.xml @@ -530,6 +530,17 @@ + <property> + <name>dfs.federation.router.safemode.checkperiod</name> + <value>5s</value> + <description> + How often the Router should check safe mode. This + setting supports multiple time unit suffixes as described in + dfs.heartbeat.interval. If no suffix is specified then milliseconds is + assumed.
+ </description> + </property> + <property> <name>dfs.federation.router.monitor.namenode</name> diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterSafemode.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterSafemode.java index 75104bd193..45a0089a31 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterSafemode.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterSafemode.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hdfs.server.federation.router; import static org.apache.hadoop.hdfs.server.federation.router.RBFConfigKeys.DFS_ROUTER_CACHE_TIME_TO_LIVE_MS; +import static org.apache.hadoop.hdfs.server.federation.router.RBFConfigKeys.DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS; import static org.apache.hadoop.hdfs.server.federation.router.RBFConfigKeys.DFS_ROUTER_SAFEMODE_EXPIRATION; import static org.apache.hadoop.hdfs.server.federation.router.RBFConfigKeys.DFS_ROUTER_SAFEMODE_EXTENSION; import static org.apache.hadoop.hdfs.server.federation.store.FederationStateStoreTestUtils.deleteStateStore; @@ -70,6 +71,9 @@ public static void create() throws IOException { // 200 ms cache refresh conf.setTimeDuration(DFS_ROUTER_CACHE_TIME_TO_LIVE_MS, 200, TimeUnit.MILLISECONDS); + // 100 ms safemode checkperiod + conf.setTimeDuration(DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS, + 100, TimeUnit.MILLISECONDS); // 1 sec post cache update before entering safemode (2 intervals) conf.setTimeDuration(DFS_ROUTER_SAFEMODE_EXPIRATION, TimeUnit.SECONDS.toMillis(1), TimeUnit.MILLISECONDS); @@ -133,7 +137,7 @@ public void testRouterExitSafemode() long interval = conf.getTimeDuration(DFS_ROUTER_SAFEMODE_EXTENSION, TimeUnit.SECONDS.toMillis(2), TimeUnit.MILLISECONDS) + - conf.getTimeDuration(DFS_ROUTER_CACHE_TIME_TO_LIVE_MS, + conf.getTimeDuration(DFS_ROUTER_SAFEMODE_CHECKPERIOD_MS, 
TimeUnit.SECONDS.toMillis(1), TimeUnit.MILLISECONDS); Thread.sleep(interval);