diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 856d1136fa..13b3bb72ac 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -83,6 +83,9 @@ private HddsConfigKeys() { public static final String HDDS_SCM_CHILLMODE_ENABLED = "hdds.scm.chillmode.enabled"; public static final boolean HDDS_SCM_CHILLMODE_ENABLED_DEFAULT = true; + public static final String HDDS_SCM_CHILLMODE_MIN_DATANODE = + "hdds.scm.chillmode.min.datanode"; + public static final int HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT = 1; // % of containers which should have at least one reported replica // before SCM comes out of chill mode. diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index b7c967d0ae..d7cbd75b6b 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -1164,6 +1164,15 @@ + + hdds.scm.chillmode.min.datanode + 1 + HDDS,SCM,OPERATION + Minimum DataNodes which should be registered to get SCM out of + chill mode. 
+ + + hdds.container.action.max.limit 20 diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMChillModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMChillModeManager.java index 3c1cc8ff3d..c11a60f6c5 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMChillModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMChillModeManager.java @@ -20,8 +20,10 @@ import com.google.common.annotations.VisibleForTesting; import java.util.EnumSet; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; @@ -60,14 +62,16 @@ public class SCMChillModeManager implements private Map exitRules = new HashMap(1); private Configuration config; private static final String CONT_EXIT_RULE = "ContainerChillModeRule"; + private static final String DN_EXIT_RULE = "DataNodeChillModeRule"; private final EventQueue eventPublisher; SCMChillModeManager(Configuration conf, List allContainers, EventQueue eventQueue) { this.config = conf; this.eventPublisher = eventQueue; - exitRules - .put(CONT_EXIT_RULE, new ContainerChillModeRule(config, allContainers)); + exitRules.put(CONT_EXIT_RULE, + new ContainerChillModeRule(config, allContainers)); + exitRules.put(DN_EXIT_RULE, new DataNodeChillModeRule(config)); if (!conf.getBoolean(HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED, HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED_DEFAULT)) { exitChillMode(eventQueue); @@ -120,6 +124,7 @@ public void onMessage( EventPublisher publisher) { if (getInChillMode()) { exitRules.get(CONT_EXIT_RULE).process(nodeRegistrationContainerReport); + exitRules.get(DN_EXIT_RULE).process(nodeRegistrationContainerReport); validateChillModeExitRules(publisher); } } @@ -187,6 +192,9 @@ 
public boolean validate() { @VisibleForTesting public double getCurrentContainerThreshold() { + if (maxContainer == 0) { + return 1; + } return (containerWithMinReplicas.doubleValue() / maxContainer); } @@ -217,6 +225,58 @@ public void cleanup() { } } + /** + * Class defining Chill mode exit criteria according to number of DataNodes + * registered with SCM. + */ + public class DataNodeChillModeRule implements + ChillModeExitRule { + + // Min DataNodes required to exit chill mode. + private int requiredDns; + private int registeredDns = 0; + // UUIDs of registered DataNodes; a set so a repeated UUID counts once. + private HashSet<UUID> registeredDnSet; + + public DataNodeChillModeRule(Configuration conf) { + requiredDns = conf + .getInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE, + HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT); + // HashSet rejects negative capacities; treat a negative minimum as 0 here. + registeredDnSet = new HashSet<>(Math.max(requiredDns, 0) * 2); + } + + @Override + public boolean validate() { + return registeredDns >= requiredDns; + } + + @VisibleForTesting + public double getRegisteredDataNodes() { + return registeredDns; + } + + @Override + public void process(NodeRegistrationContainerReport reportsProto) { + if (requiredDns <= 0) { + // No dn check required. + return; + } + + if(inChillMode.get()) { + registeredDnSet.add(reportsProto.getDatanodeDetails().getUuid()); + registeredDns = registeredDnSet.size(); + LOG.info("SCM in chill mode. 
{} DataNodes registered, {} required.", + registeredDns, requiredDns); + } + } + + @Override + public void cleanup() { + registeredDnSet.clear(); + } + } + @VisibleForTesting public static Logger getLogger() { return LOG; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMChillModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMChillModeManager.java index 486c604cd0..53d76e64bb 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMChillModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/server/TestSCMChillModeManager.java @@ -45,7 +45,7 @@ public class TestSCMChillModeManager { private List containers; @Rule - public Timeout timeout = new Timeout(1000 * 20); + public Timeout timeout = new Timeout(1000 * 35); @BeforeClass public static void setUp() { @@ -111,6 +111,45 @@ public void testDisableChillMode() { assertFalse(scmChillModeManager.getInChillMode()); } + @Test + public void testChillModeDataNodeExitRule() throws Exception { + containers = new ArrayList<>(); + testChillModeDataNodes(0); + testChillModeDataNodes(3); + testChillModeDataNodes(5); + } + + private void testChillModeDataNodes(int numOfDns) throws Exception { + OzoneConfiguration conf = new OzoneConfiguration(config); + conf.setInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE, numOfDns); + scmChillModeManager = new SCMChillModeManager(conf, containers, queue); + queue.addHandler(SCMEvents.NODE_REGISTRATION_CONT_REPORT, + scmChillModeManager); + // Assert SCM is in Chill mode. + assertTrue(scmChillModeManager.getInChillMode()); + + // Register all DataNodes except last one and assert SCM is in chill mode. 
+ for (int i = 0; i < numOfDns-1; i++) { + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, + HddsTestUtils.createNodeRegistrationContainerReport(containers)); + assertTrue(scmChillModeManager.getInChillMode()); + assertTrue(scmChillModeManager.getCurrentContainerThreshold() == 1); + } + + if(numOfDns == 0){ + GenericTestUtils.waitFor(() -> { + return scmChillModeManager.getInChillMode(); + }, 10, 1000 * 10); + return; + } + // Register last DataNode and check that SCM is out of Chill mode. + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, + HddsTestUtils.createNodeRegistrationContainerReport(containers)); + GenericTestUtils.waitFor(() -> { + return !scmChillModeManager.getInChillMode(); + }, 10, 1000 * 10); + } + private void testContainerThreshold(List dnContainers, double expectedThreshold) throws Exception {