HDDS-571. Update SCM chill mode exit criteria to optionally wait for n datanodes. Contributed by Ajay Kumar.
This commit is contained in:
parent
9bb2801e8c
commit
cdf5d58364
@ -83,6 +83,9 @@ private HddsConfigKeys() {
|
||||
public static final String HDDS_SCM_CHILLMODE_ENABLED =
|
||||
"hdds.scm.chillmode.enabled";
|
||||
public static final boolean HDDS_SCM_CHILLMODE_ENABLED_DEFAULT = true;
|
||||
// Minimum number of DataNodes that must be registered with SCM before it
// can come out of chill mode.
public static final String HDDS_SCM_CHILLMODE_MIN_DATANODE =
|
||||
"hdds.scm.chillmode.min.datanode";
|
||||
// Default: a single registered DataNode is required.
public static final int HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT = 1;
|
||||
|
||||
// % of containers which should have at least one reported replica
|
||||
// before SCM comes out of chill mode.
|
||||
|
@ -1164,6 +1164,15 @@
|
||||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hdds.scm.chillmode.min.datanode</name>
|
||||
<value>1</value>
|
||||
<tag>HDDS,SCM,OPERATION</tag>
|
||||
<description>Minimum number of DataNodes that must be registered with SCM
|
||||
before it can come out of chill mode.
|
||||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hdds.container.action.max.limit</name>
|
||||
<value>20</value>
|
||||
|
@ -20,8 +20,10 @@
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
@ -60,14 +62,16 @@ public class SCMChillModeManager implements
|
||||
private Map<String, ChillModeExitRule> exitRules = new HashMap(1);
|
||||
private Configuration config;
|
||||
private static final String CONT_EXIT_RULE = "ContainerChillModeRule";
|
||||
private static final String DN_EXIT_RULE = "DataNodeChillModeRule";
|
||||
private final EventQueue eventPublisher;
|
||||
|
||||
SCMChillModeManager(Configuration conf, List<ContainerInfo> allContainers,
|
||||
EventQueue eventQueue) {
|
||||
this.config = conf;
|
||||
this.eventPublisher = eventQueue;
|
||||
exitRules
|
||||
.put(CONT_EXIT_RULE, new ContainerChillModeRule(config, allContainers));
|
||||
exitRules.put(CONT_EXIT_RULE,
|
||||
new ContainerChillModeRule(config, allContainers));
|
||||
exitRules.put(DN_EXIT_RULE, new DataNodeChillModeRule(config));
|
||||
if (!conf.getBoolean(HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED,
|
||||
HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED_DEFAULT)) {
|
||||
exitChillMode(eventQueue);
|
||||
@ -120,6 +124,7 @@ public void onMessage(
|
||||
EventPublisher publisher) {
|
||||
if (getInChillMode()) {
|
||||
exitRules.get(CONT_EXIT_RULE).process(nodeRegistrationContainerReport);
|
||||
exitRules.get(DN_EXIT_RULE).process(nodeRegistrationContainerReport);
|
||||
validateChillModeExitRules(publisher);
|
||||
}
|
||||
}
|
||||
@ -187,6 +192,9 @@ public boolean validate() {
|
||||
|
||||
@VisibleForTesting
|
||||
public double getCurrentContainerThreshold() {
|
||||
if (maxContainer == 0) {
|
||||
return 1;
|
||||
}
|
||||
return (containerWithMinReplicas.doubleValue() / maxContainer);
|
||||
}
|
||||
|
||||
@ -217,6 +225,57 @@ public void cleanup() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Class defining Chill mode exit criteria according to number of DataNodes
|
||||
* registered with SCM.
|
||||
*/
|
||||
public class DataNodeChillModeRule implements
|
||||
ChillModeExitRule<NodeRegistrationContainerReport> {
|
||||
|
||||
// Min DataNodes required to exit chill mode.
|
||||
private int requiredDns;
|
||||
private int registeredDns = 0;
|
||||
// Set to track registered DataNodes.
|
||||
private HashSet<UUID> registeredDnSet;
|
||||
|
||||
public DataNodeChillModeRule(Configuration conf) {
|
||||
requiredDns = conf
|
||||
.getInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE,
|
||||
HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT);
|
||||
registeredDnSet = new HashSet<>(requiredDns * 2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean validate() {
|
||||
return registeredDns >= requiredDns;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public double getRegisteredDataNodes() {
|
||||
return registeredDns;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(NodeRegistrationContainerReport reportsProto) {
|
||||
if (requiredDns == 0) {
|
||||
// No dn check required.
|
||||
return;
|
||||
}
|
||||
|
||||
if(inChillMode.get()) {
|
||||
registeredDnSet.add(reportsProto.getDatanodeDetails().getUuid());
|
||||
registeredDns = registeredDnSet.size();
|
||||
LOG.info("SCM in chill mode. {} DataNodes registered, {} required.",
|
||||
registeredDns, requiredDns);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cleanup() {
|
||||
registeredDnSet.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public static Logger getLogger() {
|
||||
return LOG;
|
||||
|
@ -45,7 +45,7 @@ public class TestSCMChillModeManager {
|
||||
private List<ContainerInfo> containers;
|
||||
|
||||
@Rule
|
||||
public Timeout timeout = new Timeout(1000 * 20);
|
||||
public Timeout timeout = new Timeout(1000 * 35);
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() {
|
||||
@ -111,6 +111,45 @@ public void testDisableChillMode() {
|
||||
assertFalse(scmChillModeManager.getInChillMode());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testChillModeDataNodeExitRule() throws Exception {
|
||||
containers = new ArrayList<>();
|
||||
testChillModeDataNodes(0);
|
||||
testChillModeDataNodes(3);
|
||||
testChillModeDataNodes(5);
|
||||
}
|
||||
|
||||
private void testChillModeDataNodes(int numOfDns) throws Exception {
|
||||
OzoneConfiguration conf = new OzoneConfiguration(config);
|
||||
conf.setInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE, numOfDns);
|
||||
scmChillModeManager = new SCMChillModeManager(conf, containers, queue);
|
||||
queue.addHandler(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
|
||||
scmChillModeManager);
|
||||
// Assert SCM is in Chill mode.
|
||||
assertTrue(scmChillModeManager.getInChillMode());
|
||||
|
||||
// Register all DataNodes except last one and assert SCM is in chill mode.
|
||||
for (int i = 0; i < numOfDns-1; i++) {
|
||||
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
|
||||
HddsTestUtils.createNodeRegistrationContainerReport(containers));
|
||||
assertTrue(scmChillModeManager.getInChillMode());
|
||||
assertTrue(scmChillModeManager.getCurrentContainerThreshold() == 1);
|
||||
}
|
||||
|
||||
if(numOfDns == 0){
|
||||
GenericTestUtils.waitFor(() -> {
|
||||
return scmChillModeManager.getInChillMode();
|
||||
}, 10, 1000 * 10);
|
||||
return;
|
||||
}
|
||||
// Register last DataNode and check that SCM is out of Chill mode.
|
||||
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
|
||||
HddsTestUtils.createNodeRegistrationContainerReport(containers));
|
||||
GenericTestUtils.waitFor(() -> {
|
||||
return scmChillModeManager.getInChillMode();
|
||||
}, 10, 1000 * 10);
|
||||
}
|
||||
|
||||
private void testContainerThreshold(List<ContainerInfo> dnContainers,
|
||||
double expectedThreshold)
|
||||
throws Exception {
|
||||
|
Loading…
Reference in New Issue
Block a user