[sanity] mux simulator sanity check reports active side mismatch #8193

Closed
theasianpianist opened this issue Jul 16, 2021 · 2 comments
Assignees
theasianpianist
Labels
Dual ToR Platform ♊ Issues found on dual ToR platforms

Comments

@theasianpianist
Contributor
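
The pre-test sanity check for test_dscp_to_queue_during_decap_on_active on dual ToR testbed vms17-dual-t0-7050-1 fails even after recovery: the mux_simulator check reports an active side mismatch for mux port mbr-vms17-7-14, got lower_tor but expected upper_tor. Full traceback from the sanity_check fixture below.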

localhost = <tests.common.devices.local.Localhost object at 0x7f42f8e84d50>
duthosts = <tests.common.devices.duthosts.DutHosts object at 0x7f42fa944e90>
request = <SubRequest 'sanity_check' for <Function test_dscp_to_queue_during_decap_on_active>>
fanouthosts = {'str2-7260cx3-acs-fan-10': { os: 'eos', hostname: 'str2-7260cx3-acs-fan-10', device_type: 'FanoutLeaf' }, 'str2-7260cx3-acs-fan-11': { os: 'eos', hostname: 'str2-7260cx3-acs-fan-11', device_type: 'FanoutLeaf' }}
nbrhosts = {'ARISTA01T1': {'conf': {'bgp': {'asn': 64600, 'peers': {65100: ['10.0.0.56', 'FC00::71', '10.0.1.56', 'FC00::1:71']}}...'fc00::7e/126'}, ...}, 'properties': ['common']}, 'host': <tests.common.devices.eos.EosHost object at 0x7f42f17dc8d0>}}
tbinfo = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}

    @pytest.fixture(scope="module", autouse=True)
    def sanity_check(localhost, duthosts, request, fanouthosts, nbrhosts, tbinfo):
        logger.info("Prepare sanity check")
    
        skip_sanity = False
        allow_recover = False
        recover_method = "adaptive"
        pre_check_items = set(copy.deepcopy(SUPPORTED_CHECKS))  # Default check items
        post_check = False
    
        customized_sanity_check = None
        for m in request.node.iter_markers():
            logger.info("Found marker: m.name=%s, m.args=%s, m.kwargs=%s" % (m.name, m.args, m.kwargs))
            if m.name == "sanity_check":
                customized_sanity_check = m
                break
    
        if customized_sanity_check:
            logger.info("Process marker {} in script. m.args={}, m.kwargs={}"
                .format(customized_sanity_check.name, customized_sanity_check.args, customized_sanity_check.kwargs))
            skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False)
            allow_recover = customized_sanity_check.kwargs.get("allow_recover", False)
            recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive")
            if allow_recover and recover_method not in constants.RECOVER_METHODS:
                pytest.warning("Unsupported recover method")
                logger.info("Fall back to use default recover method 'config_reload'")
                recover_method = "config_reload"
    
            pre_check_items = _update_check_items(
                pre_check_items,
                customized_sanity_check.kwargs.get("check_items", []),
                SUPPORTED_CHECKS)
    
            post_check = customized_sanity_check.kwargs.get("post_check", False)
    
        if request.config.option.skip_sanity:
            skip_sanity = True
        if skip_sanity:
            logger.info("Skip sanity check according to command line argument or configuration of test script.")
            yield
            return
    
        if request.config.option.allow_recover:
            allow_recover = True
    
        if request.config.option.recover_method:
            recover_method = request.config.getoption("--recover_method")
    
        if request.config.option.post_check:
            post_check = True
    
        cli_check_items = request.config.getoption("--check_items")
        cli_post_check_items = request.config.getoption("--post_check_items")
    
        if cli_check_items:
            logger.info('Fine tune pre-test check items based on CLI option --check_items')
            cli_items_list=str(cli_check_items).split(',')
            pre_check_items = _update_check_items(pre_check_items, cli_items_list, SUPPORTED_CHECKS)
    
        pre_check_items = filter_check_items(tbinfo, pre_check_items)  # Filter out un-supported checks.
    
        if post_check:
            # Prepare post test check items based on the collected pre test check items.
            post_check_items = copy.copy(pre_check_items)
            if customized_sanity_check:
                post_check_items = _update_check_items(
                    post_check_items,
                    customized_sanity_check.kwargs.get("post_check_items", []),
                    SUPPORTED_CHECKS)
    
            if cli_post_check_items:
                logger.info('Fine tune post-test check items based on CLI option --post_check_items')
                cli_post_items_list = str(cli_post_check_items).split(',')
                post_check_items = _update_check_items(post_check_items, cli_post_items_list, SUPPORTED_CHECKS)
    
            post_check_items = filter_check_items(tbinfo, post_check_items)  # Filter out un-supported checks.
        else:
            post_check_items = set()
    
        logger.info("Sanity check settings: skip_sanity=%s, pre_check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s, post_check_items=%s" % \
            (skip_sanity, pre_check_items, allow_recover, recover_method, post_check, post_check_items))
    
        for item in pre_check_items.union(post_check_items):
            request.fixturenames.append(_item2fixture(item))
    
            # Workaround for pytest requirement.
            # Each possibly used check fixture must be executed in setup phase. Otherwise there could be teardown error.
            request.getfixturevalue(_item2fixture(item))
    
        if pre_check_items:
            logger.info("Start pre-test sanity checks")
    
            # Dynamically attach selected check fixtures to node
            for item in set(pre_check_items):
                request.fixturenames.append(_item2fixture(item))
    
            print_logs(duthosts)
    
            check_results = do_checks(request, pre_check_items, stage=STAGE_PRE_TEST)
            logger.debug("Pre-test sanity check results:\n%s" % json.dumps(check_results, indent=4, default=fallback_serializer))
    
            failed_results = [result for result in check_results if result['failed']]
            if failed_results:
                if not allow_recover:
                    pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"\
                        .format(json.dumps(failed_results, indent=4, default=fallback_serializer)))
                else:
                    dut_failed_results = defaultdict(list)
                    infra_recovery_actions= []
                    for failed_result in failed_results:
                        if 'host' in failed_result:
                            dut_failed_results[failed_result['host']].append(failed_result)
                        if failed_result['check_item'] in constants.INFRA_CHECK_ITEMS:
                            if 'action' in failed_result and failed_result['action'] is not None \
                                and callable(failed_result['action']):
                                infra_recovery_actions.append(failed_result['action'])
                    for dut_name, dut_results in dut_failed_results.items():
                        # Attempt to restore neighbor VM state
                        neighbor_vm_restore(duthosts[dut_name], nbrhosts, tbinfo)
                        # Attempt to restore DUT state
                        recover(duthosts[dut_name], localhost, fanouthosts, dut_results, recover_method)
                    for action in infra_recovery_actions:
                        action()
    
                    logger.info("Run sanity check again after recovery")
                    new_check_results = do_checks(request, pre_check_items, stage=STAGE_PRE_TEST, after_recovery=True)
                    logger.debug("Pre-test sanity check after recovery results:\n%s" % json.dumps(new_check_results, indent=4, default=fallback_serializer))
    
                    new_failed_results = [result for result in new_check_results if result['failed']]
                    if new_failed_results:
                        pt_assert(False, "!!!!!!!!!!!!!!!! Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!\n{}"\
>                           .format(json.dumps(new_failed_results, indent=4, default=fallback_serializer)))
E                       Failed: !!!!!!!!!!!!!!!! Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!
E                       [
E                           {
E                               "check_item": "mux_simulator", 
E                               "failed": true, 
E                               "failed_reason": "Active side mismatch for mbr-vms17-7-14, got lower_tor but expected upper_tor", 
E                               "action": "<not serializable>"
E                           }
E                       ]

action     = <function _reset_simulator_port at 0x7f42eaa265d0>
allow_recover = True
check_results = [{'check_item': 'bgp', 'failed': False, 'host': 'str2-7050cx3-acs-06'}, {'check_item': 'bgp', 'failed': False, 'host':...ected upper_tor'}, {'check_item': 'interfaces', 'down_ports': [], 'failed': False, 'host': 'str2-7050cx3-acs-06'}, ...]
cli_check_items = False
cli_post_check_items = False
customized_sanity_check = None
dut_failed_results = defaultdict(<type 'list'>, {'str2-7050cx3-acs-06': [{'check_item': 'monit', 'f...r-log': u'Accessible', u'rsyslog': u'Running', u'routeCheck': u'Status ok'}}]})
dut_name   = 'str2-7050cx3-acs-06'
dut_results = [{'check_item': 'monit', 'failed': True, 'host': 'str2-7050cx3-acs-06', 'services_status': {'container_checker': 'Status failed', 'container_memory_telemetry': 'Status ok', 'diskCheck': 'Status ok', 'root-overlay': 'Accessible', ...}}]
duthosts   = <tests.common.devices.duthosts.DutHosts object at 0x7f42fa944e90>
failed_result = {'check_item': 'monit', 'failed': True, 'host': 'str2-7050cx3-acs-06', 'services_status': {'container_checker': 'Status failed', 'container_memory_telemetry': 'Status ok', 'diskCheck': 'Status ok', 'root-overlay': 'Accessible', ...}}
failed_results = [{'action': <function _reset_simulator_port at 0x7f42eaa265d0>, 'check_item': 'mux_simulator', 'failed': True, 'failed...atus failed', 'container_memory_telemetry': 'Status ok', 'diskCheck': 'Status ok', 'root-overlay': 'Accessible', ...}}]
fanouthosts = {'str2-7260cx3-acs-fan-10': { os: 'eos', hostname: 'str2-7260cx3-acs-fan-10', device_type: 'FanoutLeaf' }, 'str2-7260cx3-acs-fan-11': { os: 'eos', hostname: 'str2-7260cx3-acs-fan-11', device_type: 'FanoutLeaf' }}
infra_recovery_actions = [<function _reset_simulator_port at 0x7f42eaa265d0>]
item       = 'dbmemory'
localhost  = <tests.common.devices.local.Localhost object at 0x7f42f8e84d50>
m          = Mark(name='topology', args=('t0',), kwargs={})
nbrhosts   = {'ARISTA01T1': {'conf': {'bgp': {'asn': 64600, 'peers': {65100: ['10.0.0.56', 'FC00::71', '10.0.1.56', 'FC00::1:71']}}...'fc00::7e/126'}, ...}, 'properties': ['common']}, 'host': <tests.common.devices.eos.EosHost object at 0x7f42f17dc8d0>}}
new_check_results = [{'check_item': 'bgp', 'failed': False, 'host': 'str2-7050cx3-acs-06'}, {'check_item': 'bgp', 'failed': False, 'host':...ected upper_tor'}, {'check_item': 'interfaces', 'down_ports': [], 'failed': False, 'host': 'str2-7050cx3-acs-06'}, ...]
new_failed_results = [{'action': <function _reset_simulator_port at 0x7f42eaa265d0>, 'check_item': 'mux_simulator', 'failed': True, 'failed_reason': 'Active side mismatch for mbr-vms17-7-14, got lower_tor but expected upper_tor'}]
post_check = False
post_check_items = set([])
pre_check_items = set(['bgp', 'dbmemory', 'interfaces', 'monit', 'mux_simulator', 'processes', ...])
recover_method = 'adaptive'
request    = <SubRequest 'sanity_check' for <Function test_dscp_to_queue_during_decap_on_active>>
result     = {'check_item': 'dbmemory', 'failed': False, 'host': 'str2-7050cx3-acs-07'}
skip_sanity = False
tbinfo     = {'auto_recover': 'True', 'comment': 'hellogemini', 'conf-name': 'vms17-dual-t0-7050-1', 'duts': ['str2-7050cx3-acs-06', 'str2-7050cx3-acs-07'], ...}

common/plugins/sanity_check/__init__.py:259: Failed
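For context, here is a minimal sketch of the comparison that produces the failed_reason string above. This is not the sonic-mgmt implementation; the function name, arguments, and result shape are assumptions modeled on the check-result dicts shown in the traceback.

```python
# Hypothetical sketch (not the sonic-mgmt code): build a mux_simulator check
# result in the same shape as the dicts shown in the traceback above.
def check_mux_active_side(port_name, actual_side, expected_side):
    """Compare the mux simulator's reported active side with the expected one.

    In the real check, port_name and actual_side would come from the mux
    simulator's status API; here they are plain arguments so the sketch is
    self-contained.
    """
    result = {"check_item": "mux_simulator", "failed": False}
    if actual_side != expected_side:
        result["failed"] = True
        result["failed_reason"] = (
            "Active side mismatch for {}, got {} but expected {}".format(
                port_name, actual_side, expected_side))
    return result


# Reproduces the failed_reason reported in this issue:
print(check_mux_active_side("mbr-vms17-7-14", "lower_tor", "upper_tor"))
# {'check_item': 'mux_simulator', 'failed': True,
#  'failed_reason': 'Active side mismatch for mbr-vms17-7-14, got lower_tor but expected upper_tor'}
```

Also for reference, the fixture above only customizes its behavior when the test module sets the sanity_check marker (customized_sanity_check was None in this run). A minimal usage sketch follows; the kwargs are the ones the fixture reads, while the check_items value is a placeholder whose exact syntax is handled by _update_check_items.

```python
import pytest

# Module-level marker read by the sanity_check fixture above (sketch only).
pytestmark = [
    pytest.mark.sanity_check(
        allow_recover=True,             # attempt recovery when a pre-test check fails
        recover_method="adaptive",      # must be one of constants.RECOVER_METHODS
        post_check=True,                # also run checks after the test module
        check_items=["mux_simulator"],  # placeholder; accepted syntax depends on _update_check_items
    )
]
```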
theasianpianist added the Dual ToR Platform ♊ Issues found on dual ToR platforms label Jul 16, 2021
qiluo-msft pushed a commit to qiluo-msft/sonic-buildimage that referenced this issue Jul 16, 2021
Update FRR to 7.5.1. The following is a list of new commits.
```
df7ab485b FRRouting Release 7.5.1
f4ed841b8 Merge pull request sonic-net#8187 from opensourcerouting/rpmfixes-75
86d5a20e3 Merge pull request sonic-net#8193 from mjstapp/fix_signals_7_5
b339cc149 lib: avoid signal-handling race with event loop poll call
0f7b432c3 lib: add debug output for signal mask
c0290c86d lib: add sigevent_check api
7a5348665 doc: Fix CentOS 7 Documentation
2a8e69f48 Merge pull request sonic-net#8064 from donaldsharp/foo
cf4d1a744 redhat: Fix changelog incorrect date format
b78dcb209 Merge pull request sonic-net#8181 from idryzhov/7.5-zebra-blackhole
2032e7e72 zebra: don't use kernel nexthops for blackhole routes
e52003567 bgpd: When deleting a neighbor from a peer-group the PGNAME is optional
aa86a6a6f Merge pull request sonic-net#8161 from mjstapp/fix_sa_7_5_backports
13a8efb4b Merge pull request sonic-net#8156 from idryzhov/7.5-backports-2021-02-26
58911c6ed lib: Free memory leak in error path in clippy
556dfd211 lib: use right type for wconv() return val
bd9caa8f1 lib: fix some misc SA warnings
683b3fe3f lib: register dependency between control plane protocol and vrf nb nodes
b45248fb6 lib: add definitions for vrf xpaths
7b9f10d04 lib: add ability to register dependencies between northbound nodes
9c240815c bgpd: Bgp peer group issue
d1b43634b bgpd: upon bgp deletion, do not systematically ask to remove main bgp
f5d1dc55e bgpd: Fix crash when we don't have a nexthop
c2e463478 frr-reload: rpki context exiting uses exit and not end
f11db1698 bgpd: Blackhole nexthops are not reachable
c628e94ff staticd: fix vrf enabling
49b079ef1 staticd: fix nexthop creation and installation
0077038e9 staticd: fix nexthop validation
be3dfbbc7 zebra: use AF_INET for protocol family
```
theasianpianist self-assigned this Jul 16, 2021
lolyu pushed a commit to lolyu/sonic-buildimage that referenced this issue Jul 26, 2021
carl-nokia pushed a commit to carl-nokia/sonic-buildimage that referenced this issue Aug 7, 2021
@wangxin
Contributor

wangxin commented Aug 16, 2021

Probably caused by this issue: #8484

@wangxin
Contributor

wangxin commented Dec 1, 2021

This issue no longer occurs after fixes were made to the image and the tests. Closing this issue.

wangxin closed this as completed Dec 1, 2021