From 944ce6e5d2d6a7866ab1e0d801cdfbfee569adc2 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 3 Sep 2021 22:45:26 -0700 Subject: [PATCH 1/8] [process_monitoring] Adding debuggin statement to check the reason why post-check failed. Signed-off-by: Yong Zhao --- .../test_critical_process_monitoring.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/process_monitoring/test_critical_process_monitoring.py b/tests/process_monitoring/test_critical_process_monitoring.py index f0ad919f1f..95d83d3199 100755 --- a/tests/process_monitoring/test_critical_process_monitoring.py +++ b/tests/process_monitoring/test_critical_process_monitoring.py @@ -126,6 +126,7 @@ def check_all_critical_processes_running(duthost): processes_status = duthost.all_critical_process_status() for container_name, processes in processes_status.items(): if processes["status"] is False or len(processes["exited_critical_process"]) > 0: + logger.info("Processes '{}' in Container '{}' are not running ...".format(processes["exited_critical_process"], container_name)) return False return True @@ -143,7 +144,18 @@ def post_test_check(duthost, up_bgp_neighbors): This function will return True if all critical processes are running and all BGP sessions are established. Otherwise it will return False. """ - return check_all_critical_processes_running(duthost) and duthost.check_bgp_session_state(up_bgp_neighbors, "established") + checking_processes_result = check_all_critical_processes_running(duthost) + checking_bgp_sessions_result = duthost.check_bgp_session_state(up_bgp_neighbors, "established") + + if not checking_processes_result: + logger.info("Not all critical processes is running.") + return False + + if not checking_bgp_sessions_result: + logger.info("Not all BGP sessions were established.") + return False + + return True def postcheck_critical_processes_status(duthost, up_bgp_neighbors): From 87959adfc0fee828b027a772f7c19ef38fb337b8 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 15 Mar 2022 15:18:21 -0700 Subject: [PATCH 2/8] [process_monitoring] Remove unrelated changes. Signed-off-by: Yong Zhao --- .../test_critical_process_monitoring.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/process_monitoring/test_critical_process_monitoring.py b/tests/process_monitoring/test_critical_process_monitoring.py index 67586f3496..c80b136adb 100755 --- a/tests/process_monitoring/test_critical_process_monitoring.py +++ b/tests/process_monitoring/test_critical_process_monitoring.py @@ -149,18 +149,7 @@ def post_test_check(duthost, up_bgp_neighbors): This function will return True if all critical processes are running and all BGP sessions are established. Otherwise it will return False. """ - checking_processes_result = check_all_critical_processes_running(duthost) - checking_bgp_sessions_result = duthost.check_bgp_session_state(up_bgp_neighbors, "established") - - if not checking_processes_result: - logger.info("Not all critical processes is running.") - return False - - if not checking_bgp_sessions_result: - logger.info("Not all BGP sessions were established.") - return False - - return True + return check_all_critical_processes_running(duthost) and duthost.check_bgp_session_state(up_bgp_neighbors, "established") def postcheck_critical_processes_status(duthost, up_bgp_neighbors): From 0ff618118e8977fccaa2fd95505db52c05374df5 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 8 Apr 2022 11:59:39 -0700 Subject: [PATCH 3/8] [pytest] Add two test cases to test Monit fails to reset its internal counter and test new syntax of Monit is a workaround for this failure. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 400 ++++++++++++++++---- 1 file changed, 336 insertions(+), 64 deletions(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index 573437910c..861f883d1d 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -1,5 +1,12 @@ """ -Test the feature of memory checker. +The 'stress' utility is leveraged to increase memory usage of a container continuously, then +1) Test whether that container can be restarted by the script ran by Monit. +2) Test whether that container can be restarted; If that container was restarted, then + test the script ran by Monit was unable to restart the container anymore due to Monit + failed to reset its internal counter. +3) Test whether that container can be restarted; If that container was restarted, then + test the script ran by Monit was able to restart the container with the help of new + Monit syntax although Monit failed to reset its internal counter. """ import logging from multiprocessing.pool import ThreadPool @@ -12,6 +19,7 @@ from tests.common.helpers.assertions import pytest_require from tests.common.helpers.dut_utils import check_container_state from tests.common.helpers.dut_utils import decode_dut_and_container_name +from tests.common.helpers.dut_utils import is_container_running from tests.common.plugins.loganalyzer.loganalyzer import LogAnalyzer logger = logging.getLogger(__name__) @@ -26,57 +34,86 @@ CONTAINER_CHECK_INTERVAL_SECS = 1 -@pytest.fixture(autouse=True) -def modify_monit_config_and_restart(duthosts, enum_dut_feature_container, enum_rand_one_per_hwsku_frontend_hostname): - - """Backup Monit configuration files, then customize and restart it before testing. - Restore original Monit configuration files and restart Monit service after testing. +def restart_container(duthost, container_name): + """Restarts the container on DuT. Args: - duthost: Hostname of DuT. + duthost: The AnsibleHost object of DuT. + container_name: A string represents name of the container. Returns: None. """ - dut_name, container_name = decode_dut_and_container_name(enum_dut_feature_container) - pytest_require(dut_name == enum_rand_one_per_hwsku_frontend_hostname and container_name == "telemetry", - "Skips testing memory_checker of container '{}' on the DuT '{}' since another DuT '{}' was chosen." - .format(container_name, dut_name, enum_rand_one_per_hwsku_frontend_hostname)) - duthost = duthosts[dut_name] + logger.info("Restarting '{}' container ...".format(container_name)) + duthost.shell("sudo systemctl restart {}.service".format(container_name)) + logger.info("'{}' is restarted.".format(container_name)) - logger.info("Back up Monit configuration files on DuT '{}' ...".format(duthost.hostname)) + +def backup_monit_config_files(duthost): + """Backs up Monit configuration files on DuT. + Args: + duthost: The AnsibleHost object of DuT. + + Returns: + None. + """ + logger.info("Backing up Monit configuration files on DuT '{}' ...".format(duthost.hostname)) duthost.shell("sudo cp -f /etc/monit/monitrc /tmp/") duthost.shell("sudo cp -f /etc/monit/conf.d/monit_telemetry /tmp/") + logger.info("Monit configuration files on DuT '{}' is backed up.".format(duthost.hostname)) - temp_config_line = ' if status == 3 for 5 times within 10 cycles then exec "/usr/bin/restart_service telemetry"' + +def customize_monit_config_files(duthost, temp_config_line): + """Customizes the Monit configuration file on DuT. + Args: + duthost: The AnsibleHost object of DuT. + temp_config_line: A stirng to replace the initial Monit configuration. + + Returns: + None. + """ logger.info("Modifying Monit config to eliminate start delay and decrease interval ...") duthost.shell("sudo sed -i '$s/^./#/' /etc/monit/conf.d/monit_telemetry") duthost.shell("echo '{}' | sudo tee -a /etc/monit/conf.d/monit_telemetry".format(temp_config_line)) - duthost.shell("sudo sed -i 's/set daemon 60/set daemon 10/' /etc/monit/monitrc") duthost.shell("sudo sed -i '/with start delay 300/s/^./#/' /etc/monit/monitrc") + logger.info("Modifying Monit config to eliminate start delay and decrease interval are done.") - logger.info("Restart Monit service ...") - duthost.shell("sudo systemctl restart monit") - yield +def restore_monit_config_files(duthost): + """Restores the initial Monit configuration file on DuT. + Args: + duthost: The AnsibleHost object of DuT. + + Returns: + None. - logger.info("Restore original Monit configuration files on DuT '{}' ...".format(duthost.hostname)) + """ + logger.info("Restoring original Monit configuration files on DuT '{}' ...".format(duthost.hostname)) duthost.shell("sudo mv -f /tmp/monitrc /etc/monit/") duthost.shell("sudo mv -f /tmp/monit_telemetry /etc/monit/conf.d/") + logger.info("Original Monit configuration files on DuT '{}' is restored.".format(duthost.hostname)) - logger.info("Restart Monit service ...") - duthost.shell("sudo systemctl restart monit") - logger.info("Restore bgp neighbours ...") - duthost.shell("config bgp startup all") +def restart_monit_service(duthost): + """Restarts Monit service on DuT. + Args: + duthost: The AnsibleHost object of DuT. + + Returns: + None. + + """ + logger.info("Restarting Monit service ...") + duthost.shell("sudo systemctl restart monit") + logger.info("Monit service is restarted.") def install_stress_utility(duthost, creds, container_name): - """Installs the 'stress' utility in container. + """Installs 'stress' utility in the container on DuT. Args: duthost: The AnsibleHost object of DuT. - container_name: Name of container. + container_name: A string represents name of the container. Returns: None. @@ -88,7 +125,9 @@ def install_stress_utility(duthost, creds, container_name): https_proxy = creds.get('proxy_env', {}).get('https_proxy', '') # Shutdown bgp for having ability to install stress tool + logger.info("Shuting down all BGP sessions ...") duthost.shell("config bgp shutdown all") + logger.info("All BGP sessions are shut down!...") install_cmd_result = duthost.shell("docker exec {} bash -c 'export http_proxy={} \ && export https_proxy={} \ && apt-get install stress -y'".format(container_name, http_proxy, https_proxy)) @@ -99,7 +138,8 @@ def install_stress_utility(duthost, creds, container_name): def remove_stress_utility(duthost, container_name): - """Removes the 'stress' utility from container. + """Removes the 'stress' utility from the container and brings up BGP sessions + on DuT. Args: duthost: The AnsibleHost object of DuT. @@ -114,6 +154,44 @@ def remove_stress_utility(duthost, container_name): pytest_assert(exit_code == 0, "Failed to remove 'stress' utility!") logger.info("'stress' utility was removed.") + logger.info("Bringing up all BGP sessions ...") + duthost.shell("config bgp startup all") + logger.info("BGP sessions are started up.") + + +@pytest.fixture +def test_setup_and_cleanup(duthosts, creds, enum_dut_feature_container, + enum_rand_one_per_hwsku_frontend_hostname, request): + """Backups Monit configuration files, customizes configuration files and + restarts Monit service before testing. Restores original Monit configuration files + and restart Monit service after testing. + + Args: + duthost: Hostname of DuT. + + Returns: + None. + """ + dut_name, container_name = decode_dut_and_container_name(enum_dut_feature_container) + pytest_require(dut_name == enum_rand_one_per_hwsku_frontend_hostname and container_name == "telemetry", + "Skips testing memory_checker of container '{}' on the DuT '{}' since another DuT '{}' was chosen." + .format(container_name, dut_name, enum_rand_one_per_hwsku_frontend_hostname)) + duthost = duthosts[dut_name] + + install_stress_utility(duthost, creds, container_name) + backup_monit_config_files(duthost) + + customize_monit_config_files(duthost, request.param) + restart_monit_service(duthost) + + yield + + restore_monit_config_files(duthost) + restart_monit_service(duthost) + restart_container(duthost, container_name) + remove_stress_utility(duthost, container_name) + postcheck_critical_processes(duthost, container_name) + def consume_memory(duthost, container_name, vm_workers): """Consumes memory more than the threshold value of specified container. @@ -131,7 +209,44 @@ def consume_memory(duthost, container_name, vm_workers): duthost.shell("docker exec {} stress -m {}".format(container_name, vm_workers), module_ignore_errors=True) -def consume_memory_and_restart_container(duthost, container_name, vm_workers, loganalyzer, marker): +def check_critical_processes(duthost, container_name): + """Checks whether the critical processes are running after container was restarted. + + Args: + duthost: The AnsibleHost object of DuT. + container_name: Name of container. + + Returns: + None. + """ + status_result = duthost.critical_process_status(container_name) + if status_result["status"] is False or len(status_result["exited_critical_process"]) > 0: + return False + + return True + + +def postcheck_critical_processes(duthost, container_name): + """Checks whether the critical processes are running after container was restarted. + + Args: + duthost: The AnsibleHost object of DuT. + container_name: Name of container. + + Returns: + None. + """ + logger.info("Checking the running status of critical processes in '{}' container ..." + .format(container_name)) + is_succeeded = wait_until(CONTAINER_RESTART_THRESHOLD_SECS, CONTAINER_CHECK_INTERVAL_SECS, 0, + check_critical_processes, duthost, container_name) + if not is_succeeded: + pytest.fail("Not all critical processes in '{}' container are running!" + .format(container_name)) + logger.info("All critical processes in '{}' container are running.".format(container_name)) + + +def consumes_memory_and_checks_container_restart(duthost, container_name, vm_workers): """Invokes the 'stress' utility to consume memory more than the threshold asynchronously and checks whether the container can be stopped and restarted. Loganalyzer was leveraged to check whether the log messages related to container stopped were generated. @@ -146,11 +261,23 @@ def consume_memory_and_restart_container(duthost, container_name, vm_workers, lo None. """ + expected_alerting_messages = [] + expected_alerting_messages.append(".*restart_service.*Restarting service 'telemetry'.*") + expected_alerting_messages.append(".*Stopping Telemetry container.*") + expected_alerting_messages.append(".*Stopped Telemetry container.*") + expected_alerting_messages.append(".*Starting Telemetry container.*") + expected_alerting_messages.append(".*Started Telemetry container.*") + + loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="container_restart_due_to_memory") + loganalyzer.expect_regex = [] + loganalyzer.expect_regex.extend(expected_alerting_messages) + marker = loganalyzer.init() + thread_pool = ThreadPool() thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers)) - logger.info("Sleep 100 seconds to wait for the alerting messages from syslog...") - time.sleep(100) + logger.info("Sleep 130 seconds to wait for the alerting messages from syslog ...") + time.sleep(130) logger.info("Checking the alerting messages related to container stopped ...") loganalyzer.analyze(marker) @@ -165,47 +292,108 @@ def consume_memory_and_restart_container(duthost, container_name, vm_workers, lo logger.info("'{}' container is restarted.".format(container_name)) -def check_critical_processes(duthost, container_name): - """Checks whether the critical processes are running after container was restarted. +def get_container_mem_usage(duthost, container_name): + """Gets the memory usage of a container. Args: duthost: The AnsibleHost object of DuT. - container_name: Name of container. + container_name: A string represents the name of container. Returns: - None. + mem_usage: A string represents memory usage. """ - status_result = duthost.critical_process_status(container_name) - if status_result["status"] is False or len(status_result["exited_critical_process"]) > 0: - return False + get_mem_usage_cmd = "docker stats --no-stream --format \{{\{{.MemUsage\}}\}} {}".format(container_name) + cmd_result = duthost.shell(get_mem_usage_cmd) - return True + exit_code = cmd_result["rc"] + pytest_assert(exit_code == 0, "Failed to get memory usage of '{}'!".format(container_name)) + mem_info = cmd_result["stdout_lines"] + mem_usage = mem_info[0].split()[0] -def postcheck_critical_processes(duthost, container_name): - """Checks whether the critical processes are running after container was restarted. + return mem_usage + + +def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_syntax_enabled): + """Invokes the 'stress' utility to consume memory more than the threshold asynchronously + and checks whether the container can be stopped and restarted. After container was restarted, + 'stress' utility will be invoked again to consume memory and checks whether Monit was able to + restart this contianer or not with or without help of new syntax. + Loganalyzer is leveraged to check whether the log messages related to container stopped + and started were generated. Args: duthost: The AnsibleHost object of DuT. container_name: Name of container. + vm_workers: Number of workers which does the spinning on malloc()/free() + to consume memory. Returns: None. + """ - logger.info("Checking the running status of critical processes in '{}' container ..." - .format(container_name)) - is_succeeded = wait_until(CONTAINER_RESTART_THRESHOLD_SECS, CONTAINER_CHECK_INTERVAL_SECS, 0, - check_critical_processes, duthost, container_name) - if not is_succeeded: - pytest.fail("Not all critical processes in '{}' container are running!" - .format(container_name)) - logger.info("All critical processes in '{}' container are running.".format(container_name)) + expected_alerting_messages = [] + expected_alerting_messages.append(".*restart_service.*Restarting service 'telemetry'.*") + expected_alerting_messages.append(".*Stopping Telemetry container.*") + expected_alerting_messages.append(".*Stopped Telemetry container.*") + expected_alerting_messages.append(".*Starting Telemetry container.*") + expected_alerting_messages.append(".*Started Telemetry container.*") + + loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="test_memory_checker") + loganalyzer.expect_regex = [] + loganalyzer.expect_regex.extend(expected_alerting_messages) + marker = loganalyzer.init() + + thread_pool = ThreadPool() + thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers)) + + logger.info("Sleep 130 seconds to wait for the alerting messages from syslog ...") + time.sleep(130) + + logger.info("Checking the alerting messages related to container restart ...") + loganalyzer.analyze(marker) + logger.info("Found all the expected alerting messages from syslog!") + + logger.info("Waiting for '{}' container to be restarted ...".format(container_name)) + restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS, + CONTAINER_CHECK_INTERVAL_SECS, + 0, + check_container_state, duthost, container_name, True) + pytest_assert(restarted, "Failed to restart '{}' container!".format(container_name)) + logger.info("'{}' container is restarted.".format(container_name)) + logger.info("Running 'stress' utility again in '{}' ...".format(container_name)) + thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers)) -def test_memory_checker(duthosts, enum_dut_feature_container, creds, enum_rand_one_per_hwsku_frontend_hostname): - """Checks whether the telemetry container can be restarted or not if the memory - usage of it is beyond the threshold. The `stress` utility is leveraged as - the memory stressing tool. + check_counter = 0 + marker = loganalyzer.update_marker_prefix("test_monit_counter") + logger.info("Checking memory usage of '{}' every 30 seconds for 6 times ...".format(container_name)) + while check_counter < 6: + check_counter += 1 + mem_usage = get_container_mem_usage(duthost, container_name) + logger.info("Memory usage of '{}' is '{}'".format(container_name, mem_usage)) + time.sleep(30) + + logger.info("Analyzing syslog messages to verify whether '{}' is restarted ...".format(container_name)) + analyzing_result = loganalyzer.analyze(marker, fail=False) + if not new_syntax_enabled: + pytest_assert(analyzing_result["total"]["expected_match"] == 0, + "Monit can reset counter and restart '{}'!".format(container_name)) + logger.info("Monit was unable to reset its counter and '{}' can not be restarted!".format(container_name)) + else: + pytest_assert(analyzing_result["total"]["expected_match"] == 5, + "Monit still can not restart '{}' with the help of new syntax!".format(container_name)) + logger.info("Monit was able to restart '{}' with the help of new syntax!".format(container_name)) + + +@pytest.mark.parametrize("test_setup_and_cleanup", + [' if status == 3 for 1 times within 2 cycles then exec "/usr/bin/restart_service telemetry"'], + indirect=["test_setup_and_cleanup"]) +def test_memory_checker(duthosts, enum_dut_feature_container, test_setup_and_cleanup, + enum_rand_one_per_hwsku_frontend_hostname): + """Checks whether the container can be restarted or not if the memory + usage of it is beyond its threshold for specfic times within a sliding window. + The `stress` utility is leveraged as the memory stressing tool. Args: duthosts: The fixture returns list of DuTs. @@ -225,25 +413,109 @@ def test_memory_checker(duthosts, enum_dut_feature_container, creds, enum_rand_o # and number of vm_workers is hard coded. We will extend this testing on all containers after # the feature 'memory_checker' is fully implemented. container_name = "telemetry" - vm_workers = 4 + vm_workers = 6 pytest_require("Celestica-E1031" not in duthost.facts["hwsku"] and (("20191130" in duthost.os_version and parse_version(duthost.os_version) > parse_version("20191130.72")) or parse_version(duthost.kernel_version) > parse_version("4.9.0")), "Test is not supported for platform Celestica E1031, 20191130.72 and older image versions!") - expected_alerting_messages = [] - loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="container_restart_due_to_memory") - loganalyzer.expect_regex = [] - expected_alerting_messages.append(".*restart_service.*Restarting service 'telemetry'.*") - expected_alerting_messages.append(".*Stopping Telemetry container.*") - expected_alerting_messages.append(".*Stopped Telemetry container.*") + if not is_container_running(duthost, container_name): + pytest.fail("'{}' is nor running!".format(container_name)) - loganalyzer.expect_regex.extend(expected_alerting_messages) - marker = loganalyzer.init() + consumes_memory_and_checks_container_restart(duthost, container_name, vm_workers) - install_stress_utility(duthost, creds, container_name) - consume_memory_and_restart_container(duthost, container_name, vm_workers, loganalyzer, marker) - remove_stress_utility(duthost, container_name) - postcheck_critical_processes(duthost, container_name) +@pytest.mark.parametrize("test_setup_and_cleanup", + [' if status == 3 for 1 times within 2 cycles then exec "/usr/bin/restart_service telemetry"'], + indirect=["test_setup_and_cleanup"]) +def test_monit_reset_counter_failure(duthosts, enum_dut_feature_container, test_setup_and_cleanup, + enum_rand_one_per_hwsku_frontend_hostname): + """Checks that Monit was unable to reset its counter. Specifically Monit will restart + the contianer if memory usage of it is larger than the threshold for specific times within + a sliding window. However, Monit was unable to restart the contianer anymore if memory usage is + still larger than the threshold continuoulsy since Monit failed to reset its internal counter. + The `stress` utility is leveraged as the memory stressing tool. + + Args: + duthosts: The fixture returns list of DuTs. + test_setup_and_cleanup: Fixture to setup prerequisites before and after testing. + enum_rand_one_per_hwsku_frontend_hostname: The fixture randomly pick up + a frontend DuT from testbed. + + Returns: + None. + """ + dut_name, container_name = decode_dut_and_container_name(enum_dut_feature_container) + pytest_require(dut_name == enum_rand_one_per_hwsku_frontend_hostname and container_name == "telemetry", + "Skips testing memory_checker of container '{}' on the DuT '{}' since another DuT '{}' was chosen." + .format(container_name, dut_name, enum_rand_one_per_hwsku_frontend_hostname)) + duthost = duthosts[dut_name] + + # TODO: Currently we only test 'telemetry' container which has the memory threshold 400MB + # and number of vm_workers is hard coded. We will extend this testing on all containers after + # the feature 'memory_checker' is fully implemented. + container_name = "telemetry" + vm_workers = 6 + + pytest_require("Celestica-E1031" not in duthost.facts["hwsku"] + and (("20191130" in duthost.os_version and parse_version(duthost.os_version) > parse_version("20191130.72")) + or parse_version(duthost.kernel_version) > parse_version("4.9.0")), + "Test is not supported for platform Celestica E1031, 20191130.72 and older image versions!") + + logger.info("Checks whether '{}' is running ...".format(container_name)) + is_running = wait_until(CONTAINER_RESTART_THRESHOLD_SECS, + CONTAINER_CHECK_INTERVAL_SECS, + 0, + check_container_state, duthost, container_name, True) + pytest_assert(is_running, "'{}' is not running on DuT!".format(container_name)) + logger.info("'{}' is running on DuT!".format(container_name)) + + consumes_memory_and_checks_monit(duthost, container_name, vm_workers, False) + + +@pytest.mark.parametrize("test_setup_and_cleanup", + [' if status == 3 for 1 times within 2 cycles then exec "/usr/bin/restart_service telemetry" repeat every 2 cycles'], + indirect=["test_setup_and_cleanup"]) +def test_monit_new_syntax(duthosts, enum_dut_feature_container, test_setup_and_cleanup, + enum_rand_one_per_hwsku_frontend_hostname): + """Checks that new syntax of Monit can mitigate the issue which shows Monit was unable + to restart container due to failing reset its internal counter. With the help of this syntax, + the culprit container can be restarted by Monit if memory usage of it is larger than the threshold + for specific times continuously. + + Args: + duthosts: The fixture returns list of DuTs. + test_setup_and_cleanup: Fixture to setup prerequisites before and after testing. + enum_rand_one_per_hwsku_frontend_hostname: The fixture randomly pick up + a frontend DuT from testbed. + + Returns: + None. + """ + dut_name, container_name = decode_dut_and_container_name(enum_dut_feature_container) + pytest_require(dut_name == enum_rand_one_per_hwsku_frontend_hostname and container_name == "telemetry", + "Skips testing memory_checker of container '{}' on the DuT '{}' since another DuT '{}' was chosen." + .format(container_name, dut_name, enum_rand_one_per_hwsku_frontend_hostname)) + duthost = duthosts[dut_name] + + # TODO: Currently we only test 'telemetry' container which has the memory threshold 400MB + # and number of vm_workers is hard coded. We will extend this testing on all containers after + # the feature 'memory_checker' is fully implemented. + container_name = "telemetry" + vm_workers = 6 + + pytest_require("Celestica-E1031" not in duthost.facts["hwsku"] + and (("20191130" in duthost.os_version and parse_version(duthost.os_version) > parse_version("20191130.72")) + or parse_version(duthost.kernel_version) > parse_version("4.9.0")), + "Test is not supported for platform Celestica E1031, 20191130.72 and older image versions!") + + logger.info("Checks whether '{}' is running ...".format(container_name)) + is_running = wait_until(CONTAINER_RESTART_THRESHOLD_SECS, + CONTAINER_CHECK_INTERVAL_SECS, + 0, + check_container_state, duthost, container_name, True) + pytest_assert(is_running, "'{}' is not running on DuT!".format(container_name)) + logger.info("'{}' is running on DuT!".format(container_name)) + + consumes_memory_and_checks_monit(duthost, container_name, vm_workers, True) From 26967c20f681006f61e1e79041674104167794dd Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 8 Apr 2022 12:36:41 -0700 Subject: [PATCH 4/8] [pytest] Delete blank lines and reword comments. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 39 ++++++++++----------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index 861f883d1d..c9d7fb1196 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -1,12 +1,12 @@ """ -The 'stress' utility is leveraged to increase memory usage of a container continuously, then +The 'stress' utility is leveraged to increase th memory usage of a container continuously, then 1) Test whether that container can be restarted by the script ran by Monit. -2) Test whether that container can be restarted; If that container was restarted, then - test the script ran by Monit was unable to restart the container anymore due to Monit - failed to reset its internal counter. -3) Test whether that container can be restarted; If that container was restarted, then - test the script ran by Monit was able to restart the container with the help of new - Monit syntax although Monit failed to reset its internal counter. +2) Test whether that container can be restarted by the script ran by Monit; If that container + was restarted, then test the script ran by Monit was unable to restart the container anymore + due to Monit failed to reset its internal counter. +3) Test whether that container can be restarted by the script ran by Monit; If that container + was restarted, then test the script ran by Monit was able to restart the container with the + help of new Monit syntax although Monit failed to reset its internal counter. """ import logging from multiprocessing.pool import ThreadPool @@ -86,12 +86,11 @@ def restore_monit_config_files(duthost): Returns: None. - """ logger.info("Restoring original Monit configuration files on DuT '{}' ...".format(duthost.hostname)) duthost.shell("sudo mv -f /tmp/monitrc /etc/monit/") duthost.shell("sudo mv -f /tmp/monit_telemetry /etc/monit/conf.d/") - logger.info("Original Monit configuration files on DuT '{}' is restored.".format(duthost.hostname)) + logger.info("Original Monit configuration files on DuT '{}' are restored.".format(duthost.hostname)) def restart_monit_service(duthost): @@ -101,7 +100,6 @@ def restart_monit_service(duthost): Returns: None. - """ logger.info("Restarting Monit service ...") duthost.shell("sudo systemctl restart monit") @@ -125,7 +123,7 @@ def install_stress_utility(duthost, creds, container_name): https_proxy = creds.get('proxy_env', {}).get('https_proxy', '') # Shutdown bgp for having ability to install stress tool - logger.info("Shuting down all BGP sessions ...") + logger.info("Shutting down all BGP sessions ...") duthost.shell("config bgp shutdown all") logger.info("All BGP sessions are shut down!...") install_cmd_result = duthost.shell("docker exec {} bash -c 'export http_proxy={} \ @@ -138,12 +136,12 @@ def install_stress_utility(duthost, creds, container_name): def remove_stress_utility(duthost, container_name): - """Removes the 'stress' utility from the container and brings up BGP sessions + """Removes the 'stress' utility from container and brings up BGP sessions on DuT. Args: duthost: The AnsibleHost object of DuT. - container_name: Name of container. + container_name: A string represents the name of container. Returns: None. @@ -162,7 +160,7 @@ def remove_stress_utility(duthost, container_name): @pytest.fixture def test_setup_and_cleanup(duthosts, creds, enum_dut_feature_container, enum_rand_one_per_hwsku_frontend_hostname, request): - """Backups Monit configuration files, customizes configuration files and + """Backups Monit configuration files, customizes Monit configuration files and restarts Monit service before testing. Restores original Monit configuration files and restart Monit service after testing. @@ -179,8 +177,8 @@ def test_setup_and_cleanup(duthosts, creds, enum_dut_feature_container, duthost = duthosts[dut_name] install_stress_utility(duthost, creds, container_name) - backup_monit_config_files(duthost) + backup_monit_config_files(duthost) customize_monit_config_files(duthost, request.param) restart_monit_service(duthost) @@ -188,6 +186,7 @@ def test_setup_and_cleanup(duthosts, creds, enum_dut_feature_container, restore_monit_config_files(duthost) restart_monit_service(duthost) + restart_container(duthost, container_name) remove_stress_utility(duthost, container_name) postcheck_critical_processes(duthost, container_name) @@ -248,18 +247,17 @@ def postcheck_critical_processes(duthost, container_name): def consumes_memory_and_checks_container_restart(duthost, container_name, vm_workers): """Invokes the 'stress' utility to consume memory more than the threshold asynchronously - and checks whether the container can be stopped and restarted. Loganalyzer was leveraged + and checks whether the container can be stopped and restarted. Loganalyzer is leveraged to check whether the log messages related to container stopped were generated. Args: duthost: The AnsibleHost object of DuT. - container_name: Name of container. + container_name: A string represents the name of container. vm_workers: Number of workers which does the spinning on malloc()/free() to consume memory. Returns: None. - """ expected_alerting_messages = [] expected_alerting_messages.append(".*restart_service.*Restarting service 'telemetry'.*") @@ -318,7 +316,7 @@ def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_sy """Invokes the 'stress' utility to consume memory more than the threshold asynchronously and checks whether the container can be stopped and restarted. After container was restarted, 'stress' utility will be invoked again to consume memory and checks whether Monit was able to - restart this contianer or not with or without help of new syntax. + restart this contianer with or without help of new syntax. Loganalyzer is leveraged to check whether the log messages related to container stopped and started were generated. @@ -327,10 +325,11 @@ def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_sy container_name: Name of container. vm_workers: Number of workers which does the spinning on malloc()/free() to consume memory. + new_syntax_enabled: Checks to make sure container will be restarted if it is set to be + `True`. Returns: None. - """ expected_alerting_messages = [] expected_alerting_messages.append(".*restart_service.*Restarting service 'telemetry'.*") From 3754436cf9722918478f5bf02d598a44480ed861 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 8 Apr 2022 12:46:15 -0700 Subject: [PATCH 5/8] [pytest] Fix a typo. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index c9d7fb1196..baf6e04d9d 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -1,5 +1,5 @@ """ -The 'stress' utility is leveraged to increase th memory usage of a container continuously, then +The 'stress' utility is leveraged to increase the memory usage of a container continuously, then 1) Test whether that container can be restarted by the script ran by Monit. 2) Test whether that container can be restarted by the script ran by Monit; If that container was restarted, then test the script ran by Monit was unable to restart the container anymore From d5a28018a5cc8d41b528d2704f38727829cc75a8 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 20 Apr 2022 19:48:03 -0700 Subject: [PATCH 6/8] [memory_checker] Address the comments such as removing the `sudo`. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 36 +++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index baf6e04d9d..05ffd71234 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -32,6 +32,7 @@ CONTAINER_STOP_THRESHOLD_SECS = 200 CONTAINER_RESTART_THRESHOLD_SECS = 180 CONTAINER_CHECK_INTERVAL_SECS = 1 +WAITING_SYSLOG_MSG_SECS = 130 def restart_container(duthost, container_name): @@ -45,7 +46,7 @@ def restart_container(duthost, container_name): None. """ logger.info("Restarting '{}' container ...".format(container_name)) - duthost.shell("sudo systemctl restart {}.service".format(container_name)) + duthost.shell("systemctl restart {}.service".format(container_name)) logger.info("'{}' is restarted.".format(container_name)) @@ -58,8 +59,9 @@ def backup_monit_config_files(duthost): None. """ logger.info("Backing up Monit configuration files on DuT '{}' ...".format(duthost.hostname)) - duthost.shell("sudo cp -f /etc/monit/monitrc /tmp/") - duthost.shell("sudo cp -f /etc/monit/conf.d/monit_telemetry /tmp/") + duthost.shell("cp -f /etc/monit/monitrc /tmp/") + duthost.shell("mv -f /etc/monit/conf.d/monit_* /tmp/") + duthost.shell("mv -f /tmp/monit_telemetry /etc/monit/conf.d/") logger.info("Monit configuration files on DuT '{}' is backed up.".format(duthost.hostname)) @@ -73,9 +75,9 @@ def customize_monit_config_files(duthost, temp_config_line): None. """ logger.info("Modifying Monit config to eliminate start delay and decrease interval ...") - duthost.shell("sudo sed -i '$s/^./#/' /etc/monit/conf.d/monit_telemetry") - duthost.shell("echo '{}' | sudo tee -a /etc/monit/conf.d/monit_telemetry".format(temp_config_line)) - duthost.shell("sudo sed -i '/with start delay 300/s/^./#/' /etc/monit/monitrc") + duthost.shell("sed -i '$s/^./#/' /etc/monit/conf.d/monit_telemetry") + duthost.shell("echo '{}' | tee -a /etc/monit/conf.d/monit_telemetry".format(temp_config_line)) + duthost.shell("sed -i '/with start delay 300/s/^./#/' /etc/monit/monitrc") logger.info("Modifying Monit config to eliminate start delay and decrease interval are done.") @@ -88,8 +90,8 @@ def restore_monit_config_files(duthost): None. """ logger.info("Restoring original Monit configuration files on DuT '{}' ...".format(duthost.hostname)) - duthost.shell("sudo mv -f /tmp/monitrc /etc/monit/") - duthost.shell("sudo mv -f /tmp/monit_telemetry /etc/monit/conf.d/") + duthost.shell("mv -f /tmp/monitrc /etc/monit/") + duthost.shell("mv -f /tmp/monit_* /etc/monit/conf.d/") logger.info("Original Monit configuration files on DuT '{}' are restored.".format(duthost.hostname)) @@ -102,7 +104,7 @@ def restart_monit_service(duthost): None. """ logger.info("Restarting Monit service ...") - duthost.shell("sudo systemctl restart monit") + duthost.shell("systemctl restart monit") logger.info("Monit service is restarted.") @@ -274,8 +276,8 @@ def consumes_memory_and_checks_container_restart(duthost, container_name, vm_wor thread_pool = ThreadPool() thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers)) - logger.info("Sleep 130 seconds to wait for the alerting messages from syslog ...") - time.sleep(130) + logger.info("Sleep '{}' seconds to wait for the alerting messages from syslog ...".format(WAITING_SYSLOG_MSG_SECS)) + time.sleep(WAITING_SYSLOG_MSG_SECS) logger.info("Checking the alerting messages related to container stopped ...") loganalyzer.analyze(marker) @@ -316,7 +318,7 @@ def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_sy """Invokes the 'stress' utility to consume memory more than the threshold asynchronously and checks whether the container can be stopped and restarted. After container was restarted, 'stress' utility will be invoked again to consume memory and checks whether Monit was able to - restart this contianer with or without help of new syntax. + restart this container with or without help of new syntax. Loganalyzer is leveraged to check whether the log messages related to container stopped and started were generated. @@ -346,8 +348,8 @@ def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_sy thread_pool = ThreadPool() thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers)) - logger.info("Sleep 130 seconds to wait for the alerting messages from syslog ...") - time.sleep(130) + logger.info("Sleep '{}' seconds to wait for the alerting messages from syslog ...".format(WAITING_SYSLOG_MSG_SECS)) + time.sleep(WAITING_SYSLOG_MSG_SECS) logger.info("Checking the alerting messages related to container restart ...") loganalyzer.analyze(marker) @@ -380,7 +382,7 @@ def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_sy "Monit can reset counter and restart '{}'!".format(container_name)) logger.info("Monit was unable to reset its counter and '{}' can not be restarted!".format(container_name)) else: - pytest_assert(analyzing_result["total"]["expected_match"] == 5, + pytest_assert(analyzing_result["total"]["expected_match"] == len(expected_alerting_messages), "Monit still can not restart '{}' with the help of new syntax!".format(container_name)) logger.info("Monit was able to restart '{}' with the help of new syntax!".format(container_name)) @@ -431,8 +433,8 @@ def test_memory_checker(duthosts, enum_dut_feature_container, test_setup_and_cle def test_monit_reset_counter_failure(duthosts, enum_dut_feature_container, test_setup_and_cleanup, enum_rand_one_per_hwsku_frontend_hostname): """Checks that Monit was unable to reset its counter. Specifically Monit will restart - the contianer if memory usage of it is larger than the threshold for specific times within - a sliding window. However, Monit was unable to restart the contianer anymore if memory usage is + the contanier if memory usage of it is larger than the threshold for specific times within + a sliding window. However, Monit was unable to restart the container anymore if memory usage is still larger than the threshold continuoulsy since Monit failed to reset its internal counter. The `stress` utility is leveraged as the memory stressing tool. From b8b433ef322ca63f28b0c4af8758c108c129fdb4 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 20 Apr 2022 23:47:47 -0700 Subject: [PATCH 7/8] [memory_checker] Need backup `telemetry` configuration file, make the change during testing and restore it after testing. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index 05ffd71234..2d2ae808c9 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -61,7 +61,7 @@ def backup_monit_config_files(duthost): logger.info("Backing up Monit configuration files on DuT '{}' ...".format(duthost.hostname)) duthost.shell("cp -f /etc/monit/monitrc /tmp/") duthost.shell("mv -f /etc/monit/conf.d/monit_* /tmp/") - duthost.shell("mv -f /tmp/monit_telemetry /etc/monit/conf.d/") + duthost.shell("cp -f /tmp/monit_telemetry /etc/monit/conf.d/") logger.info("Monit configuration files on DuT '{}' is backed up.".format(duthost.hostname)) From 3f6ae36cf3a506af0a8c292aed20e2ecd398c5c6 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Thu, 21 Apr 2022 23:38:04 -0700 Subject: [PATCH 8/8] [memory_checker] Use `apt-get purge ...` instead of `apt-get remove ...` to delete binary file and also the related configuration files. Signed-off-by: Yong Zhao --- tests/memory_checker/test_memory_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/memory_checker/test_memory_checker.py b/tests/memory_checker/test_memory_checker.py index 2d2ae808c9..f92f22c51b 100644 --- a/tests/memory_checker/test_memory_checker.py +++ b/tests/memory_checker/test_memory_checker.py @@ -149,7 +149,7 @@ def remove_stress_utility(duthost, container_name): None. """ logger.info("Removing 'stress' utility from '{}' container ...".format(container_name)) - remove_cmd_result = duthost.shell("docker exec {} apt-get remove stress -y".format(container_name)) + remove_cmd_result = duthost.shell("docker exec {} apt-get purge stress -y".format(container_name)) exit_code = remove_cmd_result["rc"] pytest_assert(exit_code == 0, "Failed to remove 'stress' utility!") logger.info("'stress' utility was removed.")