HDDS-1285. Implement actions need to be taken after chill mode exit wait time. (#612)

Bharat Viswanadham 2019-04-12 09:40:27 -07:00 committed by GitHub
parent abace709cc
commit a458c5bd96
11 changed files with 267 additions and 12 deletions


@@ -21,11 +21,17 @@
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.scm.block.BlockManager;
import org.apache.hadoop.hdds.scm.container.ReplicationManager;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.server.SCMClientProtocolServer;
import org.apache.hadoop.hdds.scm.chillmode.SCMChillModeManager.ChillModeStatus;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -36,12 +42,16 @@
*/
public class ChillModeHandler implements EventHandler<ChillModeStatus> {
private static final Logger LOG =
LoggerFactory.getLogger(ChillModeHandler.class);
private final SCMClientProtocolServer scmClientProtocolServer;
private final BlockManager scmBlockManager;
private final long waitTime;
private final AtomicBoolean isInChillMode = new AtomicBoolean(true);
private final ReplicationManager replicationManager;
private final PipelineManager scmPipelineManager;
/**
* ChillModeHandler, to handle the logic once we exit chill mode.
@@ -53,13 +63,15 @@ public class ChillModeHandler implements EventHandler<ChillModeStatus> {
public ChillModeHandler(Configuration configuration,
SCMClientProtocolServer clientProtocolServer,
BlockManager blockManager,
ReplicationManager replicationManager, PipelineManager pipelineManager) {
Objects.requireNonNull(configuration, "Configuration cannot be null");
Objects.requireNonNull(clientProtocolServer, "SCMClientProtocolServer " +
"object cannot be null");
Objects.requireNonNull(blockManager, "BlockManager object cannot be null");
Objects.requireNonNull(replicationManager, "ReplicationManager " +
"object cannot be null");
Objects.requireNonNull(pipelineManager, "PipelineManager object cannot " +
"be null");
this.waitTime = configuration.getTimeDuration(
HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_CHILL_MODE_EXIT,
HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_CHILL_MODE_EXIT_DEFAULT,
@@ -67,6 +79,7 @@ public ChillModeHandler(Configuration configuration,
this.scmClientProtocolServer = clientProtocolServer;
this.scmBlockManager = blockManager;
this.replicationManager = replicationManager;
this.scmPipelineManager = pipelineManager;
final boolean chillModeEnabled = configuration.getBoolean(
HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED,
@@ -75,6 +88,8 @@ public ChillModeHandler(Configuration configuration,
}
/**
* Set ChillMode status based on
* {@link org.apache.hadoop.hdds.scm.events.SCMEvents#CHILL_MODE_STATUS}.
@@ -101,6 +116,7 @@ public void onMessage(ChillModeStatus chillModeStatus,
Thread.currentThread().interrupt();
}
replicationManager.start();
cleanupPipelines();
});
chillModeExitThread.setDaemon(true);
@@ -109,6 +125,20 @@ public void onMessage(ChillModeStatus chillModeStatus,
}
private void cleanupPipelines() {
List<Pipeline> pipelineList = scmPipelineManager.getPipelines();
pipelineList.forEach((pipeline) -> {
try {
if (pipeline.getPipelineState() == Pipeline.PipelineState.ALLOCATED) {
scmPipelineManager.finalizeAndDestroyPipeline(pipeline, false);
}
} catch (IOException ex) {
LOG.error("Finalize and destroy pipeline failed for pipeline "
+ pipeline.toString(), ex);
}
});
}
public boolean getChillModeStatus() {
return isInChillMode.get();
}


@@ -335,7 +335,8 @@ public StorageContainerManager(OzoneConfiguration conf,
httpServer = new StorageContainerManagerHttpServer(conf);
chillModeHandler = new ChillModeHandler(configuration,
clientProtocolServer, scmBlockManager, replicationManager,
pipelineManager);
eventQueue.addHandler(SCMEvents.DATANODE_COMMAND, scmNodeManager);
eventQueue.addHandler(SCMEvents.RETRIABLE_DATANODE_COMMAND, scmNodeManager);
@@ -1077,6 +1078,16 @@ public ChillModeHandler getChillModeHandler() {
return chillModeHandler;
}
@VisibleForTesting
public SCMChillModeManager getScmChillModeManager() {
return scmChillModeManager;
}
@VisibleForTesting
public ReplicationManager getReplicationManager() {
return replicationManager;
}
public void checkAdminAccess(String remoteUser) throws IOException {
if (remoteUser != null) {
if (!scmAdminUsernames.contains(remoteUser)) {


@@ -27,6 +27,8 @@
import org.apache.hadoop.hdds.scm.container.placement.algorithms
.ContainerPlacementPolicy;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.pipeline.SCMPipelineManager;
import org.apache.hadoop.hdds.scm.server.SCMClientProtocolServer;
import org.apache.hadoop.hdds.server.events.EventQueue;
import org.apache.hadoop.test.GenericTestUtils;
@@ -49,6 +51,7 @@ public class TestChillModeHandler {
private ChillModeHandler chillModeHandler;
private EventQueue eventQueue;
private SCMChillModeManager.ChillModeStatus chillModeStatus;
private PipelineManager scmPipelineManager;
public void setup(boolean enabled) {
configuration = new OzoneConfiguration();
@@ -66,10 +69,11 @@ public void setup(boolean enabled) {
replicationManager = new ReplicationManager(configuration,
containerManager, Mockito.mock(ContainerPlacementPolicy.class),
eventQueue);
scmPipelineManager = Mockito.mock(SCMPipelineManager.class);
blockManager = Mockito.mock(BlockManagerImpl.class);
chillModeHandler =
new ChillModeHandler(configuration, scmClientProtocolServer,
blockManager, replicationManager, scmPipelineManager);
eventQueue.addHandler(SCMEvents.CHILL_MODE_STATUS, chillModeHandler);
chillModeStatus = new SCMChillModeManager.ChillModeStatus(false);


@@ -27,6 +27,8 @@
import org.apache.hadoop.hdds.scm.container.ReplicationManager;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.pipeline.SCMPipelineManager;
import org.apache.hadoop.hdds.server.events.EventQueue;
import org.apache.hadoop.test.LambdaTestUtils;
import org.junit.After;
@@ -50,8 +52,10 @@ public void setUp() throws Exception {
BlockManager blockManager = Mockito.mock(BlockManagerImpl.class);
ReplicationManager replicationManager =
Mockito.mock(ReplicationManager.class);
PipelineManager pipelineManager = Mockito.mock(SCMPipelineManager.class);
ChillModeHandler chillModeHandler = new ChillModeHandler(config,
scmClientProtocolServer, blockManager, replicationManager,
pipelineManager);
eventQueue.addHandler(SCMEvents.CHILL_MODE_STATUS, chillModeHandler);
}


@@ -0,0 +1,202 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdds.scm.chillmode;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ReplicationManager;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.After;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.util.List;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.fail;
/**
* This class tests SCM Chill mode with pipeline rules.
*/
public class TestSCMChillModeWithPipelineRules {
private static MiniOzoneCluster cluster;
private OzoneConfiguration conf = new OzoneConfiguration();
private PipelineManager pipelineManager;
private MiniOzoneCluster.Builder clusterBuilder;
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
public void setup(int numDatanodes) throws Exception {
conf.set(HddsConfigKeys.OZONE_METADATA_DIRS,
temporaryFolder.newFolder().toString());
conf.setBoolean(
HddsConfigKeys.HDDS_SCM_CHILLMODE_PIPELINE_AVAILABILITY_CHECK,
true);
conf.set(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_CHILL_MODE_EXIT, "10s");
conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATION_INTERVAL, "10s");
clusterBuilder = MiniOzoneCluster.newBuilder(conf)
.setNumDatanodes(numDatanodes)
.setHbInterval(1000)
.setHbProcessorInterval(1000);
cluster = clusterBuilder.build();
cluster.waitForClusterToBeReady();
StorageContainerManager scm = cluster.getStorageContainerManager();
pipelineManager = scm.getPipelineManager();
}
@Test
public void testScmChillMode() throws Exception {
int datanodeCount = 6;
setup(datanodeCount);
waitForRatis3NodePipelines(datanodeCount/3);
waitForRatis1NodePipelines(datanodeCount);
int totalPipelineCount = datanodeCount + (datanodeCount/3);
// Cluster is started successfully.
cluster.stop();
cluster.restartOzoneManager();
cluster.restartStorageContainerManager(false);
pipelineManager = cluster.getStorageContainerManager().getPipelineManager();
List<Pipeline> pipelineList =
pipelineManager.getPipelines(HddsProtos.ReplicationType.RATIS,
HddsProtos.ReplicationFactor.THREE);
pipelineList.get(0).getNodes().forEach(datanodeDetails -> {
try {
cluster.restartHddsDatanode(datanodeDetails, false);
} catch (Exception ex) {
fail("Datanode restart failed");
}
});
SCMChillModeManager scmChillModeManager =
cluster.getStorageContainerManager().getScmChillModeManager();
// Ceil(0.1 * 2) is 1; as one pipeline is healthy, the healthy pipeline
// rule is satisfied.
GenericTestUtils.waitFor(() ->
scmChillModeManager.getHealthyPipelineChillModeRule()
.validate(), 1000, 60000);
// As Ceil(0.9 * 2) is 2, and no datanodes from the second pipeline have
// reported yet, this rule is not met yet.
GenericTestUtils.waitFor(() ->
!scmChillModeManager.getOneReplicaPipelineChillModeRule()
.validate(), 1000, 60000);
Assert.assertTrue(cluster.getStorageContainerManager().isInChillMode());
DatanodeDetails restartedDatanode = pipelineList.get(1).getFirstNode();
// Now restart one datanode from the 2nd pipeline
try {
cluster.restartHddsDatanode(restartedDatanode, false);
} catch (Exception ex) {
fail("Datanode restart failed");
}
GenericTestUtils.waitFor(() ->
scmChillModeManager.getOneReplicaPipelineChillModeRule()
.validate(), 1000, 60000);
GenericTestUtils.waitFor(() -> !scmChillModeManager.getInChillMode(), 1000,
60000);
// As the wait time after chill mode exit has not yet elapsed, the total
// pipeline count should still be the original 6 (1-node pipelines) + 2 (3-node pipelines).
Assert.assertEquals(totalPipelineCount,
pipelineManager.getPipelines().size());
ReplicationManager replicationManager =
cluster.getStorageContainerManager().getReplicationManager();
GenericTestUtils.waitFor(() ->
replicationManager.isRunning(), 1000, 60000);
// As 4 datanodes have reported, expect 4 single-node pipelines and one
// 3-node pipeline.
waitForRatis1NodePipelines(4);
waitForRatis3NodePipelines(1);
// Restart the other datanodes in the pipeline; after some time we should
// have the same counts as the original.
pipelineList.get(1).getNodes().forEach(datanodeDetails -> {
try {
if (!restartedDatanode.equals(datanodeDetails)) {
cluster.restartHddsDatanode(datanodeDetails, false);
}
} catch (Exception ex) {
fail("Datanode restart failed");
}
});
waitForRatis1NodePipelines(datanodeCount);
waitForRatis3NodePipelines(datanodeCount/3);
}
@After
public void tearDown() {
if (cluster != null) {
cluster.shutdown();
}
}
private void waitForRatis3NodePipelines(int numPipelines)
throws TimeoutException, InterruptedException {
GenericTestUtils.waitFor(() -> pipelineManager
.getPipelines(HddsProtos.ReplicationType.RATIS,
HddsProtos.ReplicationFactor.THREE, Pipeline.PipelineState.OPEN)
.size() == numPipelines, 100, 60000);
}
private void waitForRatis1NodePipelines(int numPipelines)
throws TimeoutException, InterruptedException {
GenericTestUtils.waitFor(() -> pipelineManager
.getPipelines(HddsProtos.ReplicationType.RATIS,
HddsProtos.ReplicationFactor.ONE, Pipeline.PipelineState.OPEN)
.size() == numPipelines, 100, 60000);
}
}


@@ -140,7 +140,7 @@ public void testContainerStateManagerRestart() throws IOException,
}
}
cluster.restartStorageContainerManager(true);
List<ContainerInfo> result = cluster.getStorageContainerManager()
.getContainerManager().listContainer(null, 100);


@@ -75,7 +75,7 @@ public static void init() throws Exception {
// At this stage, there should be 2 pipelines with 1 open container each.
// Try restarting the SCM and then verify that the pipelines are in the
// correct state.
cluster.restartStorageContainerManager(true);
newContainerManager = cluster.getStorageContainerManager()
.getContainerManager();
pipelineManager = cluster.getStorageContainerManager().getPipelineManager();


@@ -159,12 +159,14 @@ static Builder newHABuilder(OzoneConfiguration conf) {
/**
* Restarts StorageContainerManager instance.
*
* @param waitForDatanode
* @throws IOException
* @throws TimeoutException
* @throws InterruptedException
*/
void restartStorageContainerManager(boolean waitForDatanode)
throws InterruptedException, TimeoutException, IOException,
AuthenticationException;
/**
* Restarts OzoneManager instance.


@@ -250,14 +250,16 @@ public OzoneClient getRestClient() throws IOException {
}
@Override
public void restartStorageContainerManager(boolean waitForDatanode)
throws TimeoutException, InterruptedException, IOException,
AuthenticationException {
scm.stop();
scm.join();
scm = StorageContainerManager.createSCM(null, conf);
scm.start();
if (waitForDatanode) {
waitForClusterToBeReady();
}
}
@Override
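
For context on the new waitForDatanode flag: passing true keeps the old behaviour of blocking until the datanodes have re-registered and the cluster reports ready, while false returns as soon as the SCM itself is back up (the new pipeline-rules test above restarts the SCM with false because it restarts the datanodes itself). A minimal caller sketch, assuming cluster is a MiniOzoneCluster handle as in the surrounding tests:

// Restart SCM and block until datanodes re-register (the behaviour existing callers keep).
cluster.restartStorageContainerManager(true);
// Restart SCM and return immediately; the caller brings datanodes back itself.
cluster.restartStorageContainerManager(false);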


@@ -260,7 +260,7 @@ public void testDNstartAfterSCM() throws Exception {
// DN should successfully register with the SCM after SCM is restarted.
// Restart the SCM
cluster.restartStorageContainerManager(true);
// Wait for DN to register
cluster.waitForClusterToBeReady();
// DN should be in HEARTBEAT state after registering with the SCM


@@ -342,7 +342,7 @@ public void testSCMChillModeDisabled() throws Exception {
assertFalse(scm.isInChillMode());
// Even on SCM restart, cluster should be out of chill mode immediately.
cluster.restartStorageContainerManager(true);
assertFalse(scm.isInChillMode());
}
}