From 1e12790a93e9a2fa49691d8759b940bf67430676 Mon Sep 17 00:00:00 2001 From: lguohan Date: Mon, 25 Jan 2021 21:13:06 -0800 Subject: [PATCH] [ceos]: check ceos testbed health via snmp and restart ceos if necessary (#2855) There are quite a few testbed failures causing kvm tests to fail. I logged into those ceos devices and found that the management ip had disappeared although the configuration was good. Restarting the ceos container makes the nbr healthy. As a mitigation effort, this pr checks the nbr health via snmp and restarts the ceos testbed if necessary. Signed-off-by: Guohan Lu --- ansible/roles/eos/tasks/ceos.yml | 14 ++- ansible/roles/vm_set/tasks/add_ceos_list.yml | 1 + ansible/testbed-cli.sh | 5 +- ansible/testbed_add_vm_topology.yml | 1 + ansible/testbed_refresh_dut.yml | 90 -------------------- tests/kvmtest.sh | 2 +- 6 files changed, 19 insertions(+), 94 deletions(-) delete mode 100644 ansible/testbed_refresh_dut.yml diff --git a/ansible/roles/eos/tasks/ceos.yml b/ansible/roles/eos/tasks/ceos.yml index bb18d5bff45..c5b274b1a60 100644 --- a/ansible/roles/eos/tasks/ceos.yml +++ b/ansible/roles/eos/tasks/ceos.yml @@ -1,3 +1,15 @@ +- snmp_facts: host={{ ansible_host }} version=v2c is_eos=true community={{ snmp_rocommunity }} + delegate_to: localhost + register: snmp_data + ignore_errors: true + +- name: set force_restart=yes for ceos container + set_fact: force_restart=yes + +- name: set force_restart=no for ceos container + set_fact: force_restart=no + when: snmp_data.ansible_facts.ansible_sysname is defined + - include_tasks: ceos_config.yml - name: Create cEOS container ceos_{{ vm_set_name }}_{{ inventory_hostname }} @@ -8,7 +20,7 @@ command: /sbin/init systemd.setenv=INTFTYPE=eth systemd.setenv=ETBA=1 systemd.setenv=SKIP_ZEROTOUCH_BARRIER_IN_SYSDBINIT=1 systemd.setenv=CEOS=1 systemd.setenv=EOS_PLATFORM=ceoslab systemd.setenv=container=docker systemd.setenv=MGMT_INTF=eth0 pull: no state: started - restart: yes + restart: "{{ force_restart }}" tty: yes network_mode: 
container:net_{{ vm_set_name }}_{{ inventory_hostname }} detach: True diff --git a/ansible/roles/vm_set/tasks/add_ceos_list.yml b/ansible/roles/vm_set/tasks/add_ceos_list.yml index c93c7619110..f421323088c 100644 --- a/ansible/roles/vm_set/tasks/add_ceos_list.yml +++ b/ansible/roles/vm_set/tasks/add_ceos_list.yml @@ -10,6 +10,7 @@ docker_image_info: name: - "{{ ceos_image_orig }}" + - "{{ ceos_image }}" become: yes register: ceos_stat diff --git a/ansible/testbed-cli.sh b/ansible/testbed-cli.sh index 361120c4d65..b19c94ab331 100755 --- a/ansible/testbed-cli.sh +++ b/ansible/testbed-cli.sh @@ -308,10 +308,11 @@ function refresh_dut ansible_options="-e sonic_vm_storage_location=$sonic_vm_dir" fi - ANSIBLE_SCP_IF_SSH=y ansible-playbook -vvv -i $vmfile testbed_refresh_dut.yml --vault-password-file="${passwd}" -l "$server" \ + ANSIBLE_SCP_IF_SSH=y ansible-playbook -i $vmfile testbed_add_vm_topology.yml --vault-password-file="${passwd}" -l "$server" \ -e topo_name="$topo_name" -e duts_name="$duts" -e VM_base="$vm_base" \ -e ptf_ip="$ptf_ip" -e topo="$topo" -e vm_set_name="$vm_set_name" \ - -e ptf_imagename="$ptf_imagename" -e ptf_ipv6="$ptf_ipv6" \ + -e ptf_imagename="$ptf_imagename" -e vm_type="$vm_type" -e ptf_ipv6="$ptf_ipv6" \ + -e force_stop_sonic_vm="yes" \ $ansible_options $@ echo Done diff --git a/ansible/testbed_add_vm_topology.yml b/ansible/testbed_add_vm_topology.yml index 23dc985f932..73f8d3e13e3 100644 --- a/ansible/testbed_add_vm_topology.yml +++ b/ansible/testbed_add_vm_topology.yml @@ -86,6 +86,7 @@ when: duts_name.split(',')|length > 1 roles: + - { role: vm_set, action: 'stop_sonic_vm', when: force_stop_sonic_vm is defined } - { role: vm_set, action: 'start_sonic_vm' } - { role: vm_set, action: 'start_sid' } - { role: vm_set, action: 'add_topo' } diff --git a/ansible/testbed_refresh_dut.yml b/ansible/testbed_refresh_dut.yml deleted file mode 100644 index 36bda351671..00000000000 --- a/ansible/testbed_refresh_dut.yml +++ /dev/null @@ -1,90 +0,0 @@ -# This 
Playbook add refresh DUT in a topology -# -# Topologies are defined inside of vars/ directorie in files vars/topo_{{ topology_name}}.yml -# This file contains three structures: -# - topology -# - configuration property -# - configuration -# -# topology key contains a dictionary of hostnames with 'vm_offset' and 'vlans' keys in it. -# 'vm_offset' is used to map current hostname vm_set VM to server VM (like ARISTA01T0 -> VM0300). -# This offset is used on VM_base -# 'vlans' is a list of vlan offsets which helps us to calculate vlan numbers which will be connected to Eth1/1..Eth1/8 interfaces. -# These offsets are used with vlan_base -# -# Every topology should have a name to distinct one topology from another on the server -# Every topology contains a ptf container which will be used as placeholder for the injected interfaces from VMs, or direct connections to PTF host -# -# To add a topology please use following command -# ANSIBLE_SCP_IF_SSH=y ansible-playbook -i veos testbed_add_vm_topology.yml --vault-password-file=~/.password -l server_3 -e vm_set_name=first -e duts_name=str-msn2700-01 -e VM_base=VM0300 -e ptf_ip=10.255.0.255/23 -e topo=t0 -e ptf_imagename="docker_ptf" -# -# Parameters -# -l server_3 - this playbook have to be limited to run only on one server -# -e vm_set_name=first - the name of vm_set -# -e duts_name=str-msn2700-01 - the name of target dut -# -e VM_base=VM0300 - the VM name which is used to as base to calculate VM name for this set -# -e ptf_ip=10.255.0.255/23 - the ip address and prefix of ptf container mgmt interface -# -e ptf_ipv6=fec0::ffff:afa:1/64 - the ipv6 address and prefix of ptf container mgmt interface -# -e topo=t0 - the name of removed topo -# -e ptf_imagename=docker-ptf - name of a docker-image which will be used for the ptf docker container - -- hosts: servers:&vm_host - gather_facts: no - vars_files: - - vars/docker_registry.yml - pre_tasks: - - name: Check for a single host - fail: msg="Please use -l server_X to limit this 
playbook to one host" - when: "{{ play_hosts|length }} != 1" - - - name: Check that variable vm_set_name is defined - fail: msg="Define vm_set_name variable with -e vm_set_name=something" - when: vm_set_name is not defined - - - name: Check that variable duts_name is defined - fail: msg="Define duts_name variable with -e duts_name=something" - when: duts_name is not defined - - - name: Check that variable VM_base is defined - fail: msg="Define VM_base variable with -e VM_base=something" - when: VM_base is not defined - - - name: Check that variable ptf_ip is defined - fail: msg="Define ptf ip variable with -e ptf_ip=something" - when: ptf_ip is not defined - - - name: Check that variable ptf_ipv6 is defined - fail: msg="Define ptf ipv6 variable with -e ptf_ipv6=something" - when: ptf_ipv6 is not defined - - - name: Check that variable topo is defined - fail: msg="Define topo variable with -e topo=something" - when: topo is not defined - - - name: Check if it is a known topology - fail: msg="Unknown topology {{ topo }}" - when: topo not in topologies - - - name: Check that variable ptf_imagename is defined - fail: msg="Define ptf_imagename variable with -e ptf_imagename=something" - when: ptf_imagename is not defined - - - name: Load topo variables - include_vars: "vars/topo_{{ topo }}.yml" - - - name: Read dut minigraph - conn_graph_facts: - host: "{{ duts_name }}" - delegate_to: localhost - when: duts_name.split(',')|length == 1 - - - name: Read duts minigraph - conn_graph_facts: - hosts: "{{ duts_name.split(',') }}" - delegate_to: localhost - when: duts_name.split(',')|length > 1 - - roles: - - { role: vm_set, action: 'stop_sonic_vm' } - - { role: vm_set, action: 'start_sonic_vm' } - - { role: vm_set, action: 'add_topo' } diff --git a/tests/kvmtest.sh b/tests/kvmtest.sh index 3a34fea2173..ec6b35a3a9c 100755 --- a/tests/kvmtest.sh +++ b/tests/kvmtest.sh @@ -92,7 +92,7 @@ fi pushd $SONIC_MGMT_DIR/ansible if [ -n "$refresh_dut" ]; then # Refresh dut in the virtual 
switch topology - ./testbed-cli.sh -m $inventory -t $testbed_file refresh-dut $tbname password.txt + ./testbed-cli.sh -m $inventory -t $testbed_file -k ceos refresh-dut $tbname password.txt sleep 120 fi