HDDS-1497. Refactor blockade Tests. Contributed by Nilotpal Nandi.
parent 8c8cb2d6aa
commit 1b041d4fd4
@@ -18,9 +18,8 @@
 """This module has apis to create and remove a blockade cluster"""
 
 from subprocess import call
-import subprocess
 import logging
-import random
+import util
 from clusterUtils.cluster_utils import ClusterUtils
 
 logger = logging.getLogger(__name__)
@@ -39,23 +38,13 @@ def blockade_up(cls):
 
   @classmethod
   def blockade_status(cls):
-    exit_code, output = ClusterUtils.run_cmd("blockade status")
+    exit_code, output = util.run_cmd("blockade status")
     return exit_code, output
 
   @classmethod
-  def make_flaky(cls, flaky_node, container_list):
-    # make the network flaky
-    om, scm, _, datanodes = \
-        ClusterUtils.find_om_scm_client_datanodes(container_list)
-    node_dict = {
-        "all": "--all",
-        "scm": scm[0],
-        "om": om[0],
-        "datanode": random.choice(datanodes)
-    }[flaky_node]
-    logger.info("flaky node: %s", node_dict)
-
-    output = call(["blockade", "flaky", node_dict])
+  def make_flaky(cls, flaky_node):
+    logger.info("flaky node: %s", flaky_node)
+    output = call(["blockade", "flaky", flaky_node])
     assert output == 0, "flaky command failed with exit code=[%s]" % output
 
   @classmethod
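
The refactored make_flaky above no longer takes a container_list; the caller resolves the target container (or blockade's --all flag) itself, which is what the rewritten test_flaky later in this commit does. A minimal sketch of the new calling convention, assuming a running blockade cluster; the container names below are hypothetical:

import random
from blockadeUtils.blockade import Blockade

# The caller picks the target first: a concrete container name for one
# node, or "--all" to make every node's network flaky.
datanodes = ["ozoneblockade_datanode_1", "ozoneblockade_datanode_2"]
Blockade.make_flaky(random.choice(datanodes))  # one random datanode
Blockade.make_flaky("--all")                   # every node in the cluster
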
@@ -69,7 +58,7 @@ def blockade_create_partition(cls, *args):
     for node_list in args:
       nodes = nodes + ','.join(node_list) + " "
     exit_code, output = \
-      ClusterUtils.run_cmd("blockade partition %s" % nodes)
+      util.run_cmd("blockade partition %s" % nodes)
     assert exit_code == 0, \
       "blockade partition command failed with exit code=[%s]" % output
 
@@ -95,4 +84,9 @@ def blockade_start(cls, node, all_nodes=False):
     else:
       output = call(["blockade", "start", node])
     assert output == 0, "blockade start command failed with " \
                         "exit code=[%s]" % output
+
+  @classmethod
+  def blockade_add(cls, node):
+    output = call(["blockade", "add", node])
+    assert output == 0, "blockade add command failed"
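
The new blockade_add registers an already-running container with blockade; Cluster.start in the new ozone/cluster.py (further below) calls it for every container reported by docker-compose ps. A hedged sketch, with a hypothetical container name:

from blockadeUtils.blockade import Blockade

# Register a container that docker-compose started, then confirm
# blockade can see and manage it.
Blockade.blockade_add("ozoneblockade_datanode_1")  # hypothetical name
exit_code, status = Blockade.blockade_status()
assert exit_code == 0, status
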
@@ -11,4 +11,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -17,6 +17,7 @@
 
+
 from subprocess import call
 
 import subprocess
 import logging
 import time
@@ -292,9 +293,15 @@ def get_key(cls, docker_compose_file, bucket_name, volume_name,
     assert exit_code == 0, "Ozone get Key failed with output=[%s]" % output
 
   @classmethod
-  def find_checksum(cls, docker_compose_file, filepath):
+  def find_checksum(cls, docker_compose_file, filepath, client="ozone_client"):
+    """
+    This function finds the checksum of a file present in a docker container.
+    Before running any 'putKey' operation, this function is called to store
+    the original checksum of the file. The file is then uploaded as a key.
+    """
     command = "docker-compose -f %s " \
-              "exec ozone_client md5sum %s" % (docker_compose_file, filepath)
+              "exec %s md5sum %s" % \
+              (docker_compose_file, client, filepath)
     exit_code, output = cls.run_cmd(command)
     assert exit_code == 0, "Cant find checksum"
     myoutput = output.split("\n")
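
find_checksum now takes a client service name instead of hard-coding ozone_client, so the md5 check can run inside any compose service. A sketch of both call styles; the compose path and the "datanode" service name are illustrative:

from clusterUtils.cluster_utils import ClusterUtils

compose_file = "docker-compose.yaml"  # illustrative path
# Default behaviour is unchanged: exec in the ozone_client service.
checksum = ClusterUtils.find_checksum(compose_file, "/tmp/key1")
# The same check can now run inside any other service, e.g. a datanode.
checksum = ClusterUtils.find_checksum(compose_file, "/tmp/key1",
                                      client="datanode")
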
hadoop-ozone/dist/src/main/blockade/ozone/__init__.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
hadoop-ozone/dist/src/main/blockade/ozone/cluster.py (new file, 295 lines)
@@ -0,0 +1,295 @@
+#!/usr/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import re
+import subprocess
+import yaml
+import util
+from subprocess import call
+from blockadeUtils.blockade import Blockade
+
+
+class Command(object):
+  docker = "docker"
+  blockade = "blockade"
+  docker_compose = "docker-compose"
+  ozone = "/opt/hadoop/bin/ozone"
+  freon = "/opt/hadoop/bin/ozone freon"
+
+
+class Configuration:
+  """
+  Configurations to be used while starting Ozone Cluster.
+  Here @property decorators is used to achieve getters, setters and delete
+  behaviour for 'datanode_count' attribute.
+  @datanode_count.setter will set the value for 'datanode_count' attribute.
+  @datanode_count.deleter will delete the current value of 'datanode_count'
+  attribute.
+  """
+
+  def __init__(self):
+    __parent_dir__ = os.path.dirname(os.path.dirname(
+      os.path.dirname(os.path.realpath(__file__))))
+    self.docker_compose_file = os.path.join(__parent_dir__,
+                                            "compose", "ozoneblockade",
+                                            "docker-compose.yaml")
+    self._datanode_count = 3
+    os.environ["DOCKER_COMPOSE_FILE"] = self.docker_compose_file
+
+  @property
+  def datanode_count(self):
+    return self._datanode_count
+
+  @datanode_count.setter
+  def datanode_count(self, datanode_count):
+    self._datanode_count = datanode_count
+
+  @datanode_count.deleter
+  def datanode_count(self):
+    del self._datanode_count
+
+
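Aside: the Configuration docstring above describes the @property plumbing; in practice a test only assigns datanode_count before creating the cluster. A minimal sketch (Cluster.create is defined further down in this file):

from ozone.cluster import Cluster, Configuration

config = Configuration()
config.datanode_count = 5           # goes through @datanode_count.setter
assert config.datanode_count == 5   # goes through the @property getter
cluster = Cluster.create(config)
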
+class Cluster(object):
+  """
+  This represents Ozone Cluster.
+  Here @property decorators is used to achieve getters, setters and delete
+  behaviour for 'om', 'scm', 'datanodes' and 'clients' attributes.
+  """
+
+  __logger__ = logging.getLogger(__name__)
+
+  def __init__(self, conf):
+    self.conf = conf
+    self.docker_compose_file = conf.docker_compose_file
+    self._om = None
+    self._scm = None
+    self._datanodes = None
+    self._clients = None
+    self.scm_uuid = None
+    self.datanode_dir = None
+
+  @property
+  def om(self):
+    return self._om
+
+  @om.setter
+  def om(self, om):
+    self._om = om
+
+  @om.deleter
+  def om(self):
+    del self._om
+
+  @property
+  def scm(self):
+    return self._scm
+
+  @scm.setter
+  def scm(self, scm):
+    self._scm = scm
+
+  @scm.deleter
+  def scm(self):
+    del self._scm
+
+  @property
+  def datanodes(self):
+    return self._datanodes
+
+  @datanodes.setter
+  def datanodes(self, datanodes):
+    self._datanodes = datanodes
+
+  @datanodes.deleter
+  def datanodes(self):
+    del self._datanodes
+
+  @property
+  def clients(self):
+    return self._clients
+
+  @clients.setter
+  def clients(self, clients):
+    self._clients = clients
+
+  @clients.deleter
+  def clients(self):
+    del self._clients
+
+  @classmethod
+  def create(cls, config=Configuration()):
+    return Cluster(config)
+
+  def start(self):
+    """
+    Start Ozone Cluster in docker containers.
+    """
+    Cluster.__logger__.info("Starting Ozone Cluster")
+    Blockade.blockade_destroy()
+    call([Command.docker_compose, "-f", self.docker_compose_file,
+          "up", "-d", "--scale",
+          "datanode=" + str(self.conf.datanode_count)])
+    Cluster.__logger__.info("Waiting 10s for cluster start up...")
+    # Remove the sleep and wait only till the cluster is out of safemode
+    # time.sleep(10)
+    output = subprocess.check_output([Command.docker_compose, "-f",
+                                      self.docker_compose_file, "ps"])
+    node_list = []
+    for out in output.split("\n")[2:-1]:
+      node = out.split(" ")[0]
+      node_list.append(node)
+      Blockade.blockade_add(node)
+
+    Blockade.blockade_status()
+    self.om = filter(lambda x: 'om' in x, node_list)[0]
+    self.scm = filter(lambda x: 'scm' in x, node_list)[0]
+    self.datanodes = sorted(list(filter(lambda x: 'datanode' in x, node_list)))
+    self.clients = filter(lambda x: 'ozone_client' in x, node_list)
+    self.scm_uuid = self.__get_scm_uuid__()
+    self.datanode_dir = self.get_conf_value("hdds.datanode.dir")
+
+    assert node_list, "no node found in the cluster!"
+    Cluster.__logger__.info("blockade created with nodes %s",
+                            ' '.join(node_list))
+
+  def get_conf_value(self, key):
+    """
+    Returns the value of given configuration key.
+    """
+    command = [Command.ozone, "getconf -confKey " + key]
+    exit_code, output = self.__run_docker_command__(command, self.om)
+    return str(output).strip()
+
+  def scale_datanode(self, datanode_count):
+    """
+    Commission new datanodes to the running cluster.
+    """
+    call([Command.docker_compose, "-f", self.docker_compose_file,
+          "up", "-d", "--scale", "datanode=" + datanode_count])
+
+  def partition_network(self, *args):
+    """
+    Partition the network which is used by the cluster.
+    """
+    Blockade.blockade_create_partition(*args)
+
+  def restore_network(self):
+    """
+    Restores the network partition.
+    """
+    Blockade.blockade_join()
+
+  def __get_scm_uuid__(self):
+    """
+    Returns SCM's UUID.
+    """
+    ozone_metadata_dir = self.get_conf_value("ozone.metadata.dirs")
+    command = "cat %s/scm/current/VERSION" % ozone_metadata_dir
+    exit_code, output = self.__run_docker_command__(command, self.scm)
+    output_list = output.split("\n")
+    key_value = [x for x in output_list if re.search(r"\w+=\w+", x)]
+    uuid = [token for token in key_value if 'scmUuid' in token]
+    return uuid.pop().split("=")[1].strip()
+
+  def get_container_states(self, datanode):
+    """
+    Returns the state of all the containers in the given datanode.
+    """
+    container_parent_path = "%s/hdds/%s/current/containerDir0" % \
+                            (self.datanode_dir, self.scm_uuid)
+    command = "find %s -type f -name '*.container'" % container_parent_path
+    exit_code, output = self.__run_docker_command__(command, datanode)
+    container_state = {}
+
+    container_list = map(str.strip, output.split("\n"))
+    for container_path in container_list:
+      # Reading the container file.
+      exit_code, output = self.__run_docker_command__(
+        "cat " + container_path, datanode)
+      if exit_code is not 0:
+        continue
+      data = output.split("\n")
+      # Reading key value pairs from container file.
+      key_value = [x for x in data if re.search(r"\w+:\s\w+", x)]
+      content = "\n".join(key_value)
+      content_yaml = yaml.load(content)
+      if content_yaml is None:
+        continue
+      for key, value in content_yaml.items():
+        content_yaml[key] = str(value).lstrip()
+      # Stores the container state in a dictionary.
+      container_state[content_yaml['containerID']] = content_yaml['state']
+    return container_state
+
+  def run_freon(self, num_volumes, num_buckets, num_keys, key_size,
+                replication_type="RATIS", replication_factor="THREE",
+                run_on=None):
+    """
+    Runs freon on the cluster.
+    """
+    if run_on is None:
+      run_on = self.om
+    command = [Command.freon,
+               " rk",
+               " --numOfVolumes " + str(num_volumes),
+               " --numOfBuckets " + str(num_buckets),
+               " --numOfKeys " + str(num_keys),
+               " --keySize " + str(key_size),
+               " --replicationType " + replication_type,
+               " --factor " + replication_factor]
+    return self.__run_docker_command__(command, run_on)
+
+  def __run_docker_command__(self, command, run_on):
+    if isinstance(command, list):
+      command = ' '.join(command)
+    command = [Command.docker,
+               "exec " + run_on,
+               command]
+    return util.run_cmd(command)
+
+  def stop(self):
+    """
+    Stops the Ozone Cluster.
+    """
+    Cluster.__logger__.info("Stopping Ozone Cluster")
+    call([Command.docker_compose, "-f", self.docker_compose_file, "down"])
+    Blockade.blockade_destroy()
+
+  def container_state_predicate_all_closed(self, datanodes):
+    for datanode in datanodes:
+      container_states_dn = self.get_container_states(datanode)
+      if not container_states_dn \
+          or container_states_dn.popitem()[1] != 'CLOSED':
+        return False
+    return True
+
+  def container_state_predicate_one_closed(self, datanodes):
+    for datanode in datanodes:
+      container_states_dn = self.get_container_states(datanode)
+      if container_states_dn and container_states_dn.popitem()[1] == 'CLOSED':
+        return True
+    return False
+
+  def container_state_predicate(self, datanode, state):
+    container_states_dn = self.get_container_states(datanode)
+    if container_states_dn and container_states_dn.popitem()[1] == state:
+      return True
+    return False
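
Taken together, cluster.py gives every test the same start/partition/restore/stop lifecycle. A condensed sketch of how a test drives it, mirroring the rewritten tests below; CONTAINER_STATUS_SLEEP is read from the environment as in those tests, with a fallback added here purely for illustration:

import os
import util
from ozone.cluster import Cluster

cluster = Cluster.create()
cluster.start()                        # docker-compose up + blockade add
try:
  cluster.run_freon(1, 1, 1, 10240)    # one 10240-byte key, RATIS/THREE
  cluster.partition_network(
    [cluster.om, cluster.scm, cluster.datanodes[0]],
    [cluster.om, cluster.scm, cluster.datanodes[1], cluster.datanodes[2]])
  cluster.restore_network()
  util.wait_until(
    lambda: cluster.container_state_predicate_all_closed(cluster.datanodes),
    int(os.environ.get("CONTAINER_STATUS_SLEEP", "300")), 10)
finally:
  cluster.stop()                       # docker-compose down + blockade destroy
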
@@ -16,132 +16,123 @@
 # limitations under the License.
 
 import os
-import time
-import re
 import logging
-from blockadeUtils.blockade import Blockade
-from clusterUtils.cluster_utils import ClusterUtils
+import util
+from ozone.cluster import Cluster
 
 logger = logging.getLogger(__name__)
-parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
-                    "docker-compose.yaml")
-os.environ["DOCKER_COMPOSE_FILE"] = FILE
-SCALE = 3
-INCREASED_SCALE = 5
-CONTAINER_LIST = []
-OM = []
-SCM = []
-DATANODES = []
 
 
-def setup():
-  global CONTAINER_LIST, OM, SCM, DATANODES
-  Blockade.blockade_destroy()
-  CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
-  exit_code, output = Blockade.blockade_status()
-  assert exit_code == 0, "blockade status command failed with output=[%s]" % \
-                         output
-  OM, SCM, _, DATANODES = \
-      ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
-
-  exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
-                                             "THREE")
-  assert exit_code == 0, "freon run failed with output=[%s]" % output
+def setup_function(function):
+  global cluster
+  cluster = Cluster.create()
+  cluster.start()
 
 
-def teardown():
-  logger.info("Inside teardown")
-  Blockade.blockade_destroy()
+def teardown_function(function):
+  cluster.stop()
 
 
-def teardown_module():
-  ClusterUtils.cluster_destroy(FILE)
-
-
-def test_isolatedatanode_singlenode(run_second_phase):
+def test_isolate_single_datanode():
   """
-  In this test, one of the datanodes (first datanode) cannot communicate
-  with other two datanodes.
-  All datanodes can communicate with SCM.
-  Expectation :
-  The container replica state in first datanode should be quasi-closed.
-  The container replica state in other datanodes should be closed.
-  """
-  first_set = [OM[0], SCM[0], DATANODES[0]]
-  second_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]]
-  Blockade.blockade_create_partition(first_set, second_set)
-  Blockade.blockade_status()
-  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-  logger.info("Waiting for %s seconds before checking container status",
-              os.environ["CONTAINER_STATUS_SLEEP"])
-  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
-  all_datanodes_container_status = \
-      ClusterUtils.findall_container_status(FILE, SCALE)
-  first_datanode_status = all_datanodes_container_status[0]
-  closed_container_datanodes = [x for x in all_datanodes_container_status
-                                if x == 'CLOSED']
-  assert first_datanode_status == 'QUASI_CLOSED'
-  assert len(closed_container_datanodes) == 2, \
-      "The container should have two closed replicas."
+  In this test case we will create a network partition in such a way that
+  one of the datanode will not be able to communicate with other datanodes
+  but it will be able to communicate with SCM.
 
-  if str(run_second_phase).lower() == "true":
-    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
-    Blockade.blockade_status()
-    logger.info("Waiting for %s seconds before checking container status",
-                os.environ["CONTAINER_STATUS_SLEEP"])
-    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
-    all_datanodes_container_status = \
-        ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
-    closed_container_datanodes = [x for x in all_datanodes_container_status
-                                  if x == 'CLOSED']
-    assert len(closed_container_datanodes) >= 3, \
-        "The container should have at least three closed replicas."
-    Blockade.blockade_join()
-    Blockade.blockade_status()
-    _, output = \
-        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-    assert re.search("Status: Success", output) is not None
+  Once the network partition happens, SCM detects it and closes the pipeline,
+  which in-turn closes the containers.
 
+  The container on the first two datanode will get CLOSED as they have quorum.
+  The container replica on the third node will be QUASI_CLOSED as it is not
+  able to connect with the other datanodes and it doesn't have latest BCSID.
 
-def test_datanode_isolation_all(run_second_phase):
-  """
-  In this test, none of the datanodes can communicate with other two
-  datanodes.
-  All datanodes can communicate with SCM.
-  Expectation : The container should eventually have at least two closed
-  replicas.
+  Once we restore the network, the stale replica on the third datanode will be
+  deleted and a latest replica will be copied from any one of the other
+  datanodes.
   """
-  first_set = [OM[0], SCM[0], DATANODES[0]]
-  second_set = [OM[0], SCM[0], DATANODES[1]]
-  third_set = [OM[0], SCM[0], DATANODES[2]]
-  Blockade.blockade_create_partition(first_set, second_set, third_set)
-  Blockade.blockade_status()
-  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-  logger.info("Waiting for %s seconds before checking container status",
-              os.environ["CONTAINER_STATUS_SLEEP"])
-  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
-  all_datanodes_container_status = \
-      ClusterUtils.findall_container_status(FILE, SCALE)
-  closed_container_datanodes = [x for x in all_datanodes_container_status
-                                if x == 'CLOSED']
-  assert len(closed_container_datanodes) >= 2, \
-      "The container should have at least two closed replicas."
-
-  if str(run_second_phase).lower() == "true":
-    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
-    Blockade.blockade_status()
-    logger.info("Waiting for %s seconds before checking container status",
-                os.environ["CONTAINER_STATUS_SLEEP"])
-    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
-    all_datanodes_container_status = \
-        ClusterUtils.findall_container_status(FILE, INCREASED_SCALE)
-    closed_container_datanodes = [x for x in all_datanodes_container_status
-                                  if x == 'CLOSED']
-    assert len(closed_container_datanodes) >= 3, \
-        "The container should have at least three closed replicas."
-    Blockade.blockade_join()
-    Blockade.blockade_status()
-    _, output = \
-        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-    assert re.search("Status: Success", output) is not None
+  cluster.run_freon(1, 1, 1, 10240)
+  first_set = [cluster.om, cluster.scm,
+               cluster.datanodes[0], cluster.datanodes[1]]
+  second_set = [cluster.om, cluster.scm, cluster.datanodes[2]]
+  logger.info("Partitioning the network")
+  cluster.partition_network(first_set, second_set)
+  cluster.run_freon(1, 1, 1, 10240)
+  logger.info("Waiting for container to be QUASI_CLOSED")
+
+  util.wait_until(lambda: cluster.get_container_states(cluster.datanodes[2])
+                  .popitem()[1] == 'QUASI_CLOSED',
+                  int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
+  container_states_dn_0 = cluster.get_container_states(cluster.datanodes[0])
+  container_states_dn_1 = cluster.get_container_states(cluster.datanodes[1])
+  container_states_dn_2 = cluster.get_container_states(cluster.datanodes[2])
+  assert len(container_states_dn_0) != 0
+  assert len(container_states_dn_1) != 0
+  assert len(container_states_dn_2) != 0
+  for key in container_states_dn_0:
+    assert container_states_dn_0.get(key) == 'CLOSED'
+  for key in container_states_dn_1:
+    assert container_states_dn_1.get(key) == 'CLOSED'
+  for key in container_states_dn_2:
+    assert container_states_dn_2.get(key) == 'QUASI_CLOSED'
+
+  # Since the replica in datanode[2] doesn't have the latest BCSID,
+  # ReplicationManager will delete it and copy a closed replica.
+  # We will now restore the network and datanode[2] should get a
+  # closed replica of the container
+  logger.info("Restoring the network")
+  cluster.restore_network()
+
+  logger.info("Waiting for the replica to be CLOSED")
+  util.wait_until(
+    lambda: cluster.container_state_predicate(cluster.datanodes[2], 'CLOSED'),
+    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
+  container_states_dn_2 = cluster.get_container_states(cluster.datanodes[2])
+  assert len(container_states_dn_2) != 0
+  for key in container_states_dn_2:
+    assert container_states_dn_2.get(key) == 'CLOSED'
+
+
+def test_datanode_isolation_all():
+  """
+  In this test case we will create a network partition in such a way that
+  all datanodes cannot communicate with each other.
+  All datanodes will be able to communicate with SCM.
+
+  Once the network partition happens, SCM detects it and closes the pipeline,
+  which in-turn tries to close the containers.
+  At least one of the replica should be in closed state
+
+  Once we restore the network, there will be three closed replicas.
+
+  """
+  cluster.run_freon(1, 1, 1, 10240)
+
+  assert len(cluster.get_container_states(cluster.datanodes[0])) != 0
+  assert len(cluster.get_container_states(cluster.datanodes[1])) != 0
+  assert len(cluster.get_container_states(cluster.datanodes[2])) != 0
+
+  logger.info("Partitioning the network")
+  first_set = [cluster.om, cluster.scm, cluster.datanodes[0]]
+  second_set = [cluster.om, cluster.scm, cluster.datanodes[1]]
+  third_set = [cluster.om, cluster.scm, cluster.datanodes[2]]
+  cluster.partition_network(first_set, second_set, third_set)
+
+  logger.info("Waiting for the replica to be CLOSED")
+  util.wait_until(
+    lambda: cluster.container_state_predicate_one_closed(cluster.datanodes),
+    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
+
+  # At least one of the replica should be in closed state
+  assert cluster.container_state_predicate_one_closed(cluster.datanodes)
+
+  # After restoring the network all the replicas should be in
+  # CLOSED state
+  logger.info("Restoring the network")
+  cluster.restore_network()
+
+  logger.info("Waiting for the container to be replicated")
+  util.wait_until(
+    lambda: cluster.container_state_predicate_all_closed(cluster.datanodes),
+    int(os.environ["CONTAINER_STATUS_SLEEP"]), 10)
+  assert cluster.container_state_predicate_all_closed(cluster.datanodes)
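
The move from setup()/teardown_module() to setup_function()/teardown_function() means pytest now builds and tears down a fresh cluster around every test instead of sharing one per module. A standalone sketch of that hook contract, using a stand-in resource rather than a real cluster; this file is illustrative only, not part of the commit:

# test_hooks_sketch.py
resource = None

def setup_function(function):
  # pytest calls this before each test function in the module.
  global resource
  resource = {"test": function.__name__, "open": True}

def teardown_function(function):
  # ...and this after each test, even if the test failed.
  resource["open"] = False

def test_gets_fresh_resource():
  assert resource["open"]
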
@@ -16,11 +16,11 @@
 # limitations under the License.
 
 import os
-import time
 import logging
+import random
 import pytest
 from blockadeUtils.blockade import Blockade
-from clusterUtils.cluster_utils import ClusterUtils
+from ozone.cluster import Cluster
 
 
 logger = logging.getLogger(__name__)
@@ -32,30 +32,36 @@
 CONTAINER_LIST = []
 
 
-def setup_module():
-  global CONTAINER_LIST
-  Blockade.blockade_destroy()
-  CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
-  exit_code, output = Blockade.blockade_status()
-  assert exit_code == 0, "blockade status command failed with output=[%s]" % \
-                         output
+def setup_function(function):
+  global cluster
+  cluster = Cluster.create()
+  cluster.start()
 
 
-def teardown_module():
-  Blockade.blockade_destroy()
-  ClusterUtils.cluster_destroy(FILE)
+def teardown_function(function):
+  cluster.stop()
 
 
-def teardown():
-  logger.info("Inside teardown")
-  Blockade.blockade_fast_all()
-  time.sleep(5)
-
-
-@pytest.mark.parametrize("flaky_nodes", ["datanode", "scm", "om", "all"])
-def test_flaky(flaky_nodes):
-  Blockade.make_flaky(flaky_nodes, CONTAINER_LIST)
+@pytest.mark.parametrize("flaky_node", ["datanode", "scm", "om", "all"])
+def test_flaky(flaky_node):
+  """
+  In these tests, we make the network of the nodes as flaky using blockade.
+  There are 4 tests :
+  1) one of the datanodes selected randomly and network of the datanode is
+  made flaky.
+  2) scm network is made flaky.
+  3) om network is made flaky.
+  4) Network of all the nodes are made flaky.
+
+  """
+  flaky_container_name = {
+    "scm": cluster.scm,
+    "om": cluster.om,
+    "datanode": random.choice(cluster.datanodes),
+    "all": "--all"
+  }[flaky_node]
+
+  Blockade.make_flaky(flaky_container_name)
   Blockade.blockade_status()
-  exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
-                                             "THREE")
+  exit_code, output = cluster.run_freon(1, 1, 1, 10240)
   assert exit_code == 0, "freon run failed with output=[%s]" % output
hadoop-ozone/dist/src/main/blockade/util.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import re
+import logging
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+
+def wait_until(predicate, timeout, check_frequency=1):
+  deadline = time.time() + timeout
+  while time.time() < deadline:
+    if predicate():
+      return
+    time.sleep(check_frequency)
+
+
+def run_cmd(cmd):
+  command = cmd
+  if isinstance(cmd, list):
+    command = ' '.join(cmd)
+  logger.info(" RUNNING: %s", command)
+  all_output = ""
+  my_process = subprocess.Popen(command, stdout=subprocess.PIPE,
+                                stderr=subprocess.STDOUT, shell=True)
+  while my_process.poll() is None:
+    op = my_process.stdout.readline()
+    if op:
+      all_output += op
+      logger.info(op)
+  other_output = my_process.communicate()
+  other_output = other_output[0].strip()
+  if other_output != "":
+    all_output += other_output
+  reg = re.compile(r"(\r\n|\n)$")
+  all_output = reg.sub("", all_output, 1)
+  return my_process.returncode, all_output
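
util.py centralises polling and shell execution for all the blockade tests. A small sketch of both helpers against a plain shell command; note that wait_until simply returns on timeout rather than raising, which is why the tests re-assert the condition afterwards:

import time
import util

# run_cmd joins list arguments, streams the output, and returns
# (exit_code, combined_output) with the trailing newline stripped.
exit_code, output = util.run_cmd(["echo", "hello"])
assert exit_code == 0 and output == "hello"

# wait_until polls the predicate every check_frequency seconds until it
# is true or the timeout expires.
start = time.time()
util.wait_until(lambda: time.time() - start > 2, timeout=5, check_frequency=1)
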
@@ -26,6 +26,12 @@ OZONE-SITE.XML_ozone.scm.client.address=scm
 OZONE-SITE.XML_ozone.scm.dead.node.interval=5m
 OZONE-SITE.XML_ozone.replication=1
 OZONE-SITE.XML_hdds.datanode.dir=/data/hdds
+OZONE-SITE.XML_ozone.scm.pipeline.owner.container.count=1
+OZONE-SITE.XML_ozone.scm.pipeline.destroy.timeout=15s
+OZONE-SITE.XML_hdds.heartbeat.interval=2s
+OZONE-SITE.XML_hdds.scm.replication.thread.interval=5s
+OZONE-SITE.XML_hdds.scm.replication.event.timeout=7s
+OZONE-SITE.XML_dfs.ratis.server.failure.duration=25s
 HDFS-SITE.XML_rpc.metrics.quantile.enable=true
 HDFS-SITE.XML_rpc.metrics.percentiles.intervals=60,300
 LOG4J.PROPERTIES_log4j.rootLogger=INFO, stdout