Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[autorestart] teamd autorestart test fails postcheck #8095

Open
theasianpianist opened this issue Jul 7, 2021 · 1 comment
Open

[autorestart] teamd autorestart test fails postcheck #8095

theasianpianist opened this issue Jul 7, 2021 · 1 comment
Assignees
Labels
Dual ToR Platform ♊ Issues found on dual ToR platforms Issue for 202012

Comments

@theasianpianist
Copy link
Contributor

duthosts = <tests.common.devices.duthosts.DutHosts object at 0x7eff9ed30890>
enum_dut_feature = 'str2-7050cx3-acs-07|teamd'
enum_rand_one_per_hwsku_frontend_hostname = 'str2-7050cx3-acs-07'
tbinfo = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}

    def test_containers_autorestart(duthosts, enum_dut_feature, enum_rand_one_per_hwsku_frontend_hostname, tbinfo):
        """
        @summary: Test the auto-restart feature of each container against two scenarios: killing
                  a non-critical process to verify the container is still running; killing each
                  critical process to verify the container will be stopped and restarted
        """
        dut_name, feature = decode_dut_port_name(enum_dut_feature)
        pytest_require(dut_name == enum_rand_one_per_hwsku_frontend_hostname and feature != "unknown",
                       "Skip test on dut host {} (chosen {}) feature {}"
                       .format(dut_name, enum_rand_one_per_hwsku_frontend_hostname, feature))
    
        duthost = duthosts[dut_name]
>       run_test_on_single_container(duthost, feature, tbinfo)

dut_name   = 'str2-7050cx3-acs-07'
duthost    = <MultiAsicSonicHost> str2-7050cx3-acs-07
duthosts   = <tests.common.devices.duthosts.DutHosts object at 0x7eff9ed30890>
enum_dut_feature = 'str2-7050cx3-acs-07|teamd'
enum_rand_one_per_hwsku_frontend_hostname = 'str2-7050cx3-acs-07'
feature    = 'teamd'
tbinfo     = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}

autorestart/test_container_autorestart.py:389: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

duthost = <MultiAsicSonicHost> str2-7050cx3-acs-07, container_name = 'teamd'
tbinfo = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}

    def run_test_on_single_container(duthost, container_name, tbinfo):
        container_autorestart_states = duthost.get_container_autorestart_states()
        disabled_containers = get_disabled_container_list(duthost)
    
        skip_condition = disabled_containers[:]
        skip_condition.append("database")
        skip_condition.append("acms")
        if tbinfo["topo"]["type"] != "t0":
            skip_condition.append("radv")
    
        # Skip testing the database container, radv container on T1 devices and containers/services which are disabled
        pytest_require(container_name not in skip_condition,
                       "Skipping test for container {}".format(container_name))
    
        is_running = is_container_running(duthost, container_name)
        pytest_assert(is_running, "Container '{}' is not running. Exiting...".format(container_name))
    
        bgp_neighbors = duthost.get_bgp_neighbors()
        up_bgp_neighbors = [ k.lower() for k, v in bgp_neighbors.items() if v["state"] == "established" ]
    
        logger.info("Start testing the container '{}'...".format(container_name))
    
        restore_disabled_state = False
        if container_autorestart_states[container_name] == "disabled":
            logger.info("Change auto-restart state of container '{}' to be 'enabled'".format(container_name))
            duthost.shell("sudo config feature autorestart {} enabled".format(container_name))
            restore_disabled_state = True
    
        # Currently we select 'rsyslogd' as non-critical processes for testing based on
        # the assumption that every container has an 'rsyslogd' process running and it is not
        # considered to be a critical process
        program_status, program_pid = get_program_info(duthost, container_name, "rsyslogd")
        verify_no_autorestart_with_non_critical_process(duthost, container_name, "rsyslogd",
                                                        program_status, program_pid)
    
        critical_group_list, critical_process_list, succeeded = duthost.get_critical_group_and_process_lists(container_name)
        pytest_assert(succeeded, "Failed to get critical group and process lists of container '{}'".format(container_name))
    
        for critical_process in critical_process_list:
            # Skip 'dsserve' process since it was not managed by supervisord
            # TODO: Should remove the following two lines once the issue was solved in the image.
            if container_name == "syncd" and critical_process == "dsserve":
                continue
    
            program_status, program_pid = get_program_info(duthost, container_name, critical_process)
            verify_autorestart_with_critical_process(duthost, container_name, critical_process,
                                                     program_status, program_pid)
            # Sleep 20 seconds in order to let the processes come into live after container is restarted.
            # We will uncomment the following line once the "extended" mode is added
            # time.sleep(20)
            # We are currently only testing one critical process, that is why we use 'break'. Once
            # we add the "extended" mode, we will remove this statement
            break
    
        for critical_group in critical_group_list:
            group_program_info = get_group_program_info(duthost, container_name, critical_group)
            for program_name in group_program_info:
                verify_autorestart_with_critical_process(duthost, container_name, program_name,
                                                         group_program_info[program_name][0],
                                                         group_program_info[program_name][1])
                # We are currently only testing one critical program for each critical group, which is
                # why we use 'break' statement. Once we add the "extended" mode, we will remove this
                # statement
                break
    
        if restore_disabled_state:
            logger.info("Restore auto-restart state of container '{}' to 'disabled'".format(container_name))
            duthost.shell("sudo config feature autorestart {} disabled".format(container_name))
    
        if not postcheck_critical_processes_status(duthost, container_autorestart_states, up_bgp_neighbors):
            config_reload(duthost)
>           pytest.fail("Some post check failed after testing feature {}".format(container_name))
E           Failed: Some post check failed after testing feature teamd

bgp_neighbors = {'10.0.1.57': {'accepted prefixes': 2, 'admin': u'up', 'capabilities': {'peer restart timer': 300}, 'connections dropp...ccepted prefixes': 2, 'admin': u'up', 'capabilities': {'peer restart timer': 300}, 'connections dropped': 0, ...}, ...}
container_autorestart_states = {'acms': 'disabled', 'bgp': 'disabled', 'dhcp_relay': 'disabled', 'lldp': 'disabled', ...}
container_name = 'teamd'
critical_group_list = []
critical_process = 'teammgrd'
critical_process_list = ['teammgrd', 'teamsyncd', 'tlm_teamd']
disabled_containers = []
duthost    = <MultiAsicSonicHost> str2-7050cx3-acs-07
is_running = True
k          = 'fc00::1:7e'
program_pid = 20
program_status = 'RUNNING'
restore_disabled_state = True
skip_condition = ['database', 'acms']
succeeded  = True
tbinfo     = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}
up_bgp_neighbors = ['10.0.1.63', '10.0.1.61', '10.0.1.59', '10.0.1.57', 'fc00::1:76', 'fc00::1:72', ...]
v          = {'accepted prefixes': 2, 'admin': u'up', 'capabilities': {'peer restart timer': 300}, 'connections dropped': 0, ...}

autorestart/test_container_autorestart.py:372: Failed
@yozhao101
Copy link
Contributor

The nightly auto-restart test of the teamd container failed at the post-check stage in nightly runs #172 and #173. The nightly test logs are available at https://sonic-jenkins.corp.microsoft.com/job/NewTests/job/TEMPLATE_PYTEST_T0_DUALTOR-7050/172/testReport/autorestart/test_container_autorestart/test_containers_autorestart_str2_7050cx3_acs_07_str2_7050cx3_acs_07_teamd_/ and https://sonic-jenkins.corp.microsoft.com/job/NewTests/job/TEMPLATE_PYTEST_T0_DUALTOR-7050/173/artifact/tests/logs/autorestart/test_container_autorestart.log.

The root cause is that both nightly tests failed in check_bgp_session_state(...), which checks whether the BGP sessions are in the established state. According to the logs, the BGP sessions are currently in the active state rather than the established state.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Dual ToR Platform ♊ Issues found on dual ToR platforms Issue for 202012
Projects
None yet
Development

No branches or pull requests

2 participants