From 7cd7045eea24ee0001f9069be370a028c64e36a6 Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Wed, 3 Apr 2019 22:19:10 +0530 Subject: [PATCH] HDDS-1164. Add New blockade Tests to test Replica Manager. Contributed by Nilotpal Nandi. --- .../main/blockade/blockadeUtils/blockade.py | 5 +- .../blockade/clusterUtils/cluster_utils.py | 23 +- .../blockade/test_blockade_client_failure.py | 9 +- .../test_blockade_datanode_isolation.py | 5 +- .../blockade/test_blockade_mixed_failure.py | 152 ++++++----- ...ckade_mixed_failure_three_nodes_isolate.py | 245 ++++++++++++------ .../test_blockade_mixed_failure_two_nodes.py | 184 ++++++++----- .../blockade/test_blockade_scm_isolation.py | 5 +- 8 files changed, 403 insertions(+), 225 deletions(-) diff --git a/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py b/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py index c3d1bbb217..f371865a05 100644 --- a/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py +++ b/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py @@ -45,9 +45,8 @@ def blockade_status(cls): @classmethod def make_flaky(cls, flaky_node, container_list): # make the network flaky - om = filter(lambda x: 'ozoneManager' in x, container_list) - scm = filter(lambda x: 'scm' in x, container_list) - datanodes = filter(lambda x: 'datanode' in x, container_list) + om, scm, _, datanodes = \ + ClusterUtils.find_om_scm_client_datanodes(container_list) node_dict = { "all": "--all", "scm" : scm[0], diff --git a/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py b/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py index baa396093f..3a04103d95 100644 --- a/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py +++ b/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py @@ -71,7 +71,7 @@ def cluster_destroy(cls, docker_compose_file): @classmethod def run_freon(cls, docker_compose_file, num_volumes, num_buckets, num_keys, key_size, replication_type, replication_factor, - freon_client='ozoneManager'): + freon_client='om'): # run freon cmd = "docker-compose -f %s " \ "exec %s /opt/hadoop/bin/ozone " \ @@ -115,7 +115,7 @@ def run_cmd(cls, cmd): @classmethod def get_ozone_confkey_value(cls, docker_compose_file, key_name): cmd = "docker-compose -f %s " \ - "exec ozoneManager /opt/hadoop/bin/ozone " \ + "exec om /opt/hadoop/bin/ozone " \ "getconf -confKey %s" \ % (docker_compose_file, key_name) exit_code, output = cls.run_cmd(cmd) @@ -307,3 +307,22 @@ def find_checksum(cls, docker_compose_file, filepath): checksum = finaloutput.split(" ") logger.info("Checksum of %s is : %s", filepath, checksum[0]) return checksum[0] + + @classmethod + def get_pipelines(cls, docker_compose_file): + command = "docker-compose -f %s " \ + + "exec ozone_client /opt/hadoop/bin/ozone scmcli " \ + + "listPipelines" % (docker_compose_file) + exit_code, output = cls.run_cmd(command) + assert exit_code == 0, "list pipeline command failed" + return output + + @classmethod + def find_om_scm_client_datanodes(cls, container_list): + + om = filter(lambda x: 'om_1' in x, container_list) + scm = filter(lambda x: 'scm' in x, container_list) + datanodes = sorted( + list(filter(lambda x: 'datanode' in x, container_list))) + client = filter(lambda x: 'ozone_client' in x, container_list) + return om, scm, client, datanodes \ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py index 
c9e4ae51ba..8c0b518493 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py @@ -47,11 +47,8 @@ def setup(): exit_code, output = Blockade.blockade_status() assert exit_code == 0, "blockade status command failed with output=[%s]" % \ output - OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST) - SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST) - DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST))) - CLIENT = filter(lambda x: 'ozone_client' in x, CONTAINER_LIST) - + OM, SCM, CLIENT, DATANODES = \ + ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE", "ozone_client") assert exit_code == 0, "freon run failed with output=[%s]" % output @@ -121,4 +118,4 @@ def test_client_failure_isolate_one_datanode(): test_key_name, "/tmp/") key_checksum = ClusterUtils.find_checksum(FILE, "/tmp/%s" % test_key_name) - assert key_checksum == ORIG_CHECKSUM + assert key_checksum == ORIG_CHECKSUM \ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py index fecd9d1009..1e53a32a5f 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py @@ -42,9 +42,8 @@ def setup(): exit_code, output = Blockade.blockade_status() assert exit_code == 0, "blockade status command failed with output=[%s]" % \ output - OM = [x for x in CONTAINER_LIST if 'ozoneManager' in x] - SCM = [x for x in CONTAINER_LIST if 'scm' in x] - DATANODES = sorted(x for x in CONTAINER_LIST if 'datanode' in x) + OM, SCM, _, DATANODES = \ + ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py index 59755e0156..8493ce0272 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py @@ -18,16 +18,17 @@ import os import time import logging +import re from blockadeUtils.blockade import Blockade from clusterUtils.cluster_utils import ClusterUtils - logger = logging.getLogger(__name__) parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) FILE = os.path.join(parent_dir, "compose", "ozoneblockade", "docker-compose.yaml") os.environ["DOCKER_COMPOSE_FILE"] = FILE SCALE = 3 +INCREASED_SCALE = 5 CONTAINER_LIST = [] OM = [] SCM = [] @@ -35,83 +36,114 @@ def setup(): - global CONTAINER_LIST, OM, SCM, DATANODES - Blockade.blockade_destroy() - CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) - exit_code, output = Blockade.blockade_status() - assert exit_code == 0, "blockade status command failed with output=[%s]" % \ - output - OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST) - SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST) - DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST))) + global CONTAINER_LIST, OM, SCM, DATANODES + Blockade.blockade_destroy() + CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) + exit_code, output = Blockade.blockade_status() + assert exit_code == 0, "blockade status command failed with output=[%s]" % \ + 
output + OM, SCM, _, DATANODES = \ + ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) - exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", - "THREE") - assert exit_code == 0, "freon run failed with output=[%s]" % output + exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", + "THREE") + assert exit_code == 0, "freon run failed with output=[%s]" % output def teardown(): - logger.info("Inside teardown") - Blockade.blockade_destroy() + logger.info("Inside teardown") + Blockade.blockade_destroy() def teardown_module(): - ClusterUtils.cluster_destroy(FILE) + ClusterUtils.cluster_destroy(FILE) -def test_one_dn_isolate_scm_other_dn(): - """ - In this test, one of the datanodes cannot communicate with SCM and other - datanodes. - Other datanodes can communicate with each other and SCM . - Expectation : The container should eventually have two closed replicas. - """ - first_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]] - second_set = [OM[0], DATANODES[0]] - Blockade.blockade_create_partition(first_set, second_set) +def test_one_dn_isolate_scm_other_dn(run_second_phase): + """ + In this test, one of the datanodes cannot communicate with SCM and other + datanodes. + Other datanodes can communicate with each other and SCM . + Expectation : The container should eventually have two closed replicas. + """ + first_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]] + second_set = [OM[0], DATANODES[0]] + Blockade.blockade_create_partition(first_set, second_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + count_closed_container_datanodes = filter(lambda x: x == 'CLOSED', + all_datanodes_container_status) + assert len(count_closed_container_datanodes) == 2, \ + "The container should have two closed replicas." + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - count_closed_container_datanodes = filter(lambda x: x == 'CLOSED', - all_datanodes_container_status) - assert len(count_closed_container_datanodes) == 2, \ - "The container should have two closed replicas." + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) >= 3, \ + "The container should have at least three closed replicas." + _, output = \ + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + assert re.search("Status: Success", output) is not None -def test_one_dn_isolate_other_dn(): - """ - In this test, one of the datanodes (first datanode) cannot communicate - other datanodes but can communicate with SCM. - One of the other two datanodes (second datanode) cannot communicate with - SCM. - Expectation : - The container replica state in first datanode can be either closed or - quasi-closed. 
- The container replica state in second datanode can be either closed or open. - The container should eventually have at lease one closed replica. - """ - first_set = [OM[0], SCM[0], DATANODES[0]] - second_set = [OM[0], DATANODES[1], DATANODES[2]] - third_set = [SCM[0], DATANODES[2]] - Blockade.blockade_create_partition(first_set, second_set, third_set) +def test_one_dn_isolate_other_dn(run_second_phase): + """ + In this test, one of the datanodes (first datanode) cannot communicate + other datanodes but can communicate with SCM. + One of the other two datanodes (second datanode) cannot communicate with + SCM. + Expectation : + The container replica state in first datanode can be either closed or + quasi-closed. + The container replica state in second datanode can be either closed or open. + The container should eventually have at lease one closed replica. + """ + first_set = [OM[0], SCM[0], DATANODES[0]] + second_set = [OM[0], DATANODES[1], DATANODES[2]] + third_set = [SCM[0], DATANODES[2]] + Blockade.blockade_create_partition(first_set, second_set, third_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + count_closed_container_datanodes = filter(lambda x: x == 'CLOSED', + all_datanodes_container_status) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + assert first_datanode_status == 'CLOSED' or \ + first_datanode_status == "QUASI_CLOSED" + assert second_datanode_status == 'CLOSED' or \ + second_datanode_status == "OPEN" + assert len(count_closed_container_datanodes) >= 1, \ + "The container should have at least one closed replica" + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - count_closed_container_datanodes = filter(lambda x: x == 'CLOSED', - all_datanodes_container_status) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - assert first_datanode_status == 'CLOSED' or \ - first_datanode_status == "QUASI_CLOSED" - assert second_datanode_status == 'CLOSED' or \ - second_datanode_status == "OPEN" - assert len(count_closed_container_datanodes) >= 1, \ - "The container should have at least one closed replica" \ No newline at end of file + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) >= 3, \ + "The container should have at least three closed replicas." 
+ _, output = \ + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + assert re.search("Status: Success", output) is not None \ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py index ee4d031087..0e50025847 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py @@ -18,16 +18,17 @@ import os import time import logging +import re from blockadeUtils.blockade import Blockade from clusterUtils.cluster_utils import ClusterUtils - logger = logging.getLogger(__name__) parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) FILE = os.path.join(parent_dir, "compose", "ozoneblockade", "docker-compose.yaml") os.environ["DOCKER_COMPOSE_FILE"] = FILE SCALE = 3 +INCREASED_SCALE = 5 CONTAINER_LIST = [] OM = [] SCM = [] @@ -35,110 +36,190 @@ def setup(): - global CONTAINER_LIST, OM, SCM, DATANODES - Blockade.blockade_destroy() - CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) - exit_code, output = Blockade.blockade_status() - assert exit_code == 0, "blockade status command failed with output=[%s]" % \ - output - OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST) - SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST) - DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST))) + global CONTAINER_LIST, OM, SCM, DATANODES + Blockade.blockade_destroy() + CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) + exit_code, output = Blockade.blockade_status() + assert exit_code == 0, "blockade status command failed with output=[%s]" % \ + output + OM, SCM, _, DATANODES = \ + ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) - exit_code, output = \ - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") - assert exit_code == 0, "freon run failed with output=[%s]" % output + exit_code, output = \ + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + assert exit_code == 0, "freon run failed with output=[%s]" % output def teardown(): - logger.info("Inside teardown") - Blockade.blockade_destroy() + logger.info("Inside teardown") + Blockade.blockade_destroy() def teardown_module(): - ClusterUtils.cluster_destroy(FILE) + ClusterUtils.cluster_destroy(FILE) -def test_three_dns_isolate_onescmfailure(): - """ - In this test, all datanodes are isolated from each other. - One of the datanodes (third datanode) cannot communicate with SCM. - Expectation : - The container replica state in first datanode should be closed. - The container replica state in second datanode should be closed. - The container replica state in third datanode should be open. - """ - first_set = [OM[0], SCM[0], DATANODES[0]] - second_set = [OM[0], SCM[0], DATANODES[1]] - third_set = [OM[0], DATANODES[2]] - Blockade.blockade_create_partition(first_set, second_set, third_set) +def test_three_dns_isolate_onescmfailure(run_second_phase): + """ + In this test, all datanodes are isolated from each other. + One of the datanodes (third datanode) cannot communicate with SCM. + Expectation : + The container replica state in first datanode should be closed. + The container replica state in second datanode should be closed. + The container replica state in third datanode should be open. 
+ """ + first_set = [OM[0], SCM[0], DATANODES[0]] + second_set = [OM[0], SCM[0], DATANODES[1]] + third_set = [OM[0], DATANODES[2]] + Blockade.blockade_create_partition(first_set, second_set, third_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + third_datanode_status = all_datanodes_container_status[2] + assert first_datanode_status == 'CLOSED' + assert second_datanode_status == 'CLOSED' + assert third_datanode_status == 'OPEN' + + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - third_datanode_status = all_datanodes_container_status[2] - assert first_datanode_status == 'CLOSED' - assert second_datanode_status == 'CLOSED' - assert third_datanode_status == 'OPEN' - - -def test_three_dns_isolate_twoscmfailure(): - """ - In this test, all datanodes are isolated from each other. - two datanodes cannot communicate with SCM (second datanode and third - datanode) - Expectation : - The container replica state in first datanode should be quasi-closed. - The container replica state in second datanode should be open. - The container replica state in third datanode should be open. - """ - first_set = [OM[0], SCM[0], DATANODES[0]] - second_set = [OM[0], DATANODES[1]] - third_set = [OM[0], DATANODES[2]] - Blockade.blockade_create_partition(first_set, second_set, third_set) + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) == 3, \ + "The container should have three closed replicas." + Blockade.blockade_join() Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - third_datanode_status = all_datanodes_container_status[2] - assert first_datanode_status == 'QUASI_CLOSED' - assert second_datanode_status == 'OPEN' - assert third_datanode_status == 'OPEN' + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) == 3, \ + "The container should have three closed replicas." 
-def test_three_dns_isolate_threescmfailure(): - """ - In this test, all datanodes are isolated from each other and also cannot - communicate with SCM. - Expectation : - The container replica state in first datanode should be open. - The container replica state in second datanode should be open. - The container replica state in third datanode should be open. - """ - first_set = [OM[0], DATANODES[0]] - second_set = [OM[0], DATANODES[1]] - third_set = [OM[0], DATANODES[2]] - Blockade.blockade_create_partition(first_set, second_set, third_set) +def test_three_dns_isolate_twoscmfailure(run_second_phase): + """ + In this test, all datanodes are isolated from each other. + two datanodes cannot communicate with SCM (second datanode and third + datanode) + Expectation : + The container replica state in first datanode should be quasi-closed. + The container replica state in second datanode should be open. + The container replica state in third datanode should be open. + """ + first_set = [OM[0], SCM[0], DATANODES[0]] + second_set = [OM[0], DATANODES[1]] + third_set = [OM[0], DATANODES[2]] + Blockade.blockade_create_partition(first_set, second_set, third_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + third_datanode_status = all_datanodes_container_status[2] + assert first_datanode_status == 'QUASI_CLOSED' + assert second_datanode_status == 'OPEN' + assert third_datanode_status == 'OPEN' + + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - third_datanode_status = all_datanodes_container_status[2] - assert first_datanode_status == 'OPEN' - assert second_datanode_status == 'OPEN' - assert third_datanode_status == 'OPEN' \ No newline at end of file + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_quasi_closed_container_datanodes = filter( + lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status) + assert len(count_quasi_closed_container_datanodes) >= 3, \ + "The container should have at least three quasi-closed replicas." + Blockade.blockade_join() + Blockade.blockade_status() + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) == 3, \ + "The container should have three closed replicas." 
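One implementation detail worth noting: the assertions above count replica states with len(filter(...)), which only works on Python 2, where filter() returns a list; these blockade scripts target Python 2. Purely as an illustration, an equivalent count that would also run on Python 3 (the helper name is mine, not part of the patch):

# Illustrative helper, equivalent to len(filter(lambda x: x == state, statuses))
def count_replicas_in_state(all_datanodes_container_status, state):
  return sum(1 for status in all_datanodes_container_status
             if status == state)

# e.g. assert count_replicas_in_state(all_datanodes_container_status,
#                                     'CLOSED') == 3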
+ + +def test_three_dns_isolate_threescmfailure(run_second_phase): + """ + In this test, all datanodes are isolated from each other and also cannot + communicate with SCM. + Expectation : + The container replica state in first datanode should be open. + The container replica state in second datanode should be open. + The container replica state in third datanode should be open. + """ + first_set = [OM[0], DATANODES[0]] + second_set = [OM[0], DATANODES[1]] + third_set = [OM[0], DATANODES[2]] + Blockade.blockade_create_partition(first_set, second_set, third_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + third_datanode_status = all_datanodes_container_status[2] + assert first_datanode_status == 'OPEN' + assert second_datanode_status == 'OPEN' + assert third_datanode_status == 'OPEN' + + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) + Blockade.blockade_status() + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + output = ClusterUtils.get_pipelines(FILE) + if output: + assert re.search("Factor:THREE", output) is None + all_datanodes_container_status = \ + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + datanodes_having_container_status = filter( + lambda x: x != 'None', all_datanodes_container_status) + assert len(datanodes_having_container_status) == 3, \ + "Containers should not be replicated on addition of new nodes." + Blockade.blockade_join() + Blockade.blockade_status() + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) == 3, \ + "The container should have three closed replicas." 
\ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py index a8a6f9b72c..b8df2fa54b 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py @@ -18,16 +18,17 @@ import os import time import logging +import re from blockadeUtils.blockade import Blockade from clusterUtils.cluster_utils import ClusterUtils - logger = logging.getLogger(__name__) parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) FILE = os.path.join(parent_dir, "compose", "ozoneblockade", "docker-compose.yaml") os.environ["DOCKER_COMPOSE_FILE"] = FILE SCALE = 3 +INCREASED_SCALE = 5 CONTAINER_LIST = [] OM = [] SCM = [] @@ -35,87 +36,138 @@ def setup(): - global CONTAINER_LIST, OM, SCM, DATANODES - Blockade.blockade_destroy() - CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) - exit_code, output = Blockade.blockade_status() - assert exit_code == 0, "blockade status command failed with output=[%s]" % \ - output - OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST) - SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST) - DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST))) + global CONTAINER_LIST, OM, SCM, DATANODES + Blockade.blockade_destroy() + CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE) + exit_code, output = Blockade.blockade_status() + assert exit_code == 0, "blockade status command failed with output=[%s]" % \ + output + OM, SCM, _, DATANODES = \ + ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) - exit_code, output = \ - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") - assert exit_code == 0, "freon run failed with output=[%s]" % output + exit_code, output = \ + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + assert exit_code == 0, "freon run failed with output=[%s]" % output def teardown(): - logger.info("Inside teardown") - Blockade.blockade_destroy() + logger.info("Inside teardown") + Blockade.blockade_destroy() def teardown_module(): - ClusterUtils.cluster_destroy(FILE) + ClusterUtils.cluster_destroy(FILE) -def test_two_dns_isolate_scm_same_partition(): - """ - In this test, one of the datanodes (first datanode) cannot communicate - with other two datanodes. - Two datanodes (second datanode and third datanode), on same network - parition, cannot communicate with SCM. - Expectation : - The container replica state in first datanode should be quasi-closed. - The container replica state in second datanode should be open. - The container replica state in third datanode should be open. - """ - first_set = [OM[0], DATANODES[1], DATANODES[2]] - second_set = [OM[0], SCM[0], DATANODES[0]] - Blockade.blockade_create_partition(first_set, second_set) +def test_two_dns_isolate_scm_same_partition(run_second_phase): + """ + In this test, there are three datanodes, DN1, DN2, DN3 + DN1 is on a network partition and + DN2, DN3 are on a different network partition. + DN2 and DN3 cannot communicate with SCM. + Expectation : + The container replica state in DN1 should be quasi-closed. + The container replica state in DN2 should be open. + The container replica state in DN3 should be open. 
+ """ + first_set = [OM[0], DATANODES[1], DATANODES[2]] + second_set = [OM[0], SCM[0], DATANODES[0]] + Blockade.blockade_create_partition(first_set, second_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + third_datanode_status = all_datanodes_container_status[2] + assert first_datanode_status == 'QUASI_CLOSED' + assert second_datanode_status == 'OPEN' + assert third_datanode_status == 'OPEN' + + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - third_datanode_status = all_datanodes_container_status[2] - assert first_datanode_status == 'QUASI_CLOSED' - assert second_datanode_status == 'OPEN' - assert third_datanode_status == 'OPEN' - - -def test_two_dns_isolate_scm_different_partition(): - """ - In this test, one of the datanodes (first datanode) cannot communicate with - other two datanodes. - Two datanodes (first datanode and second datanode), - on different network paritions, cannot communicate with SCM. - Expectation : - The container replica state in first datanode should be open. - The container replica states can be either 'closed' - in both second and third datanode, or, - 'open' in second datanode and 'quasi-closed' in third datanode. - """ - first_set = [OM[0], DATANODES[0]] - second_set = [OM[0], DATANODES[1], DATANODES[2]] - third_set = [SCM[0], DATANODES[2]] - Blockade.blockade_create_partition(first_set, second_set, third_set) + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_quasi_closed_container_datanodes = filter( + lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status) + assert len(count_quasi_closed_container_datanodes) >= 3, \ + "The container should have at least three quasi-closed replicas." + Blockade.blockade_join() + Blockade.blockade_status() + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) >= 3 + + +def test_two_dns_isolate_scm_different_partition(run_second_phase): + """ + In this test, there are three datanodes, DN1, DN2, DN3 + DN1 is on a network partition and + DN2, DN3 are on a different network partition. + DN1 and DN2 cannot communicate with SCM. + Expectation : + The container replica state in datanode DN1 should be open. + The container replica states can be either 'closed' + in DN2 and DN3, or, + 'open' in DN2 and 'quasi-closed' in DN3. 
+ """ + first_set = [OM[0], DATANODES[0]] + second_set = [OM[0], DATANODES[1], DATANODES[2]] + third_set = [SCM[0], DATANODES[2]] + Blockade.blockade_create_partition(first_set, second_set, third_set) + Blockade.blockade_status() + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + logger.info("Waiting for %s seconds before checking container status", + os.environ["CONTAINER_STATUS_SLEEP"]) + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status(FILE, SCALE) + first_datanode_status = all_datanodes_container_status[0] + second_datanode_status = all_datanodes_container_status[1] + third_datanode_status = all_datanodes_container_status[2] + assert first_datanode_status == 'OPEN' + assert (second_datanode_status == 'CLOSED' and + third_datanode_status == 'CLOSED') or \ + (second_datanode_status == 'OPEN' and + third_datanode_status == 'QUASI_CLOSED') + + if str(run_second_phase).lower() == "true": + ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False) Blockade.blockade_status() - ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") logger.info("Waiting for %s seconds before checking container status", os.environ["CONTAINER_STATUS_SLEEP"]) time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) all_datanodes_container_status = \ - ClusterUtils.find_all_datanodes_container_status(FILE, SCALE) - first_datanode_status = all_datanodes_container_status[0] - second_datanode_status = all_datanodes_container_status[1] - third_datanode_status = all_datanodes_container_status[2] - assert first_datanode_status == 'OPEN' - assert (second_datanode_status == 'CLOSED' and - third_datanode_status == 'CLOSED') or \ - (second_datanode_status == 'OPEN' and - third_datanode_status == 'QUASI_CLOSED') \ No newline at end of file + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + count_qausi_closed_container_datanodes = filter( + lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) >= 3 or \ + len(count_qausi_closed_container_datanodes) >= 3 + Blockade.blockade_join() + Blockade.blockade_status() + if len(count_closed_container_datanodes) < 3: + time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"])) + all_datanodes_container_status = \ + ClusterUtils.findall_container_status( + FILE, INCREASED_SCALE) + count_closed_container_datanodes = filter( + lambda x: x == 'CLOSED', all_datanodes_container_status) + assert len(count_closed_container_datanodes) >= 3 + _, output = \ + ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE") + assert re.search("Status: Success", output) is not None \ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py index 54c31e8dd5..06f4263746 100644 --- a/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py +++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py @@ -42,9 +42,8 @@ def setup(): exit_code, output = Blockade.blockade_status() assert exit_code == 0, "blockade status command failed with output=[%s]" % \ output - OM = [x for x in CONTAINER_LIST if 'ozoneManager' in x] - SCM = [x for x in CONTAINER_LIST if 'scm' in x] - DATANODES = sorted(x for x in CONTAINER_LIST if 'datanode' in x) + OM, SCM, _, DATANODES = \ + 
ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST) exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
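For reference, the new ClusterUtils.find_om_scm_client_datanodes helper that all of these setup() functions now call returns the containers in a fixed (om, scm, client, datanodes) order, with the datanode list sorted. A short usage sketch follows; the example_setup wrapper and its return value are illustrative only, and container names are assumed to follow the ozoneblockade compose naming (om_1, scm_1, ozone_client_1, datanode suffixes):

from clusterUtils.cluster_utils import ClusterUtils

def example_setup(compose_file, scale):
  # Bring up the cluster and split the container list into its roles.
  container_list = ClusterUtils.cluster_setup(compose_file, scale)
  om, scm, client, datanodes = \
      ClusterUtils.find_om_scm_client_datanodes(container_list)
  # om, scm and client each hold a single matching container; datanodes is
  # a sorted list, so datanodes[0] is the first datanode.  Under Python 3
  # the filter() results would need wrapping in list() before indexing;
  # this suite targets Python 2.
  return om[0], scm[0], client[0], datanodes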