Skip to content

Commit

Permalink
[202012] Improve the cleanup of PTF container (#10533)
Browse files Browse the repository at this point in the history
* Improve the cleanup of processes and interfaces before stopping PTF container (#10069)

What is the motivation for this PR?
We still observe issues with "testbed-cli.sh remove-topo" and "testbed-cli.sh restart-ptf":

Server may crash and run into CPU softlock issue.
Some exabgp process cannot be fully stopped and "restart-ptf" may fail.
The expectation is that remove-topo and restart-ptf can always be successful. And of course, no server crash.
Possible reason of server crash:

Some exabgp processes are still running in PTF container while we remove the container. This could cause server crash.
Some network interfaces are in the PTF container's network namespace while we remove the container.

How did you do it?
Added a customized module "ptf_control" to stop&kill processes running in PTF container in a more aggressive and reliable way.
Improve the vm_topology module to remove network interfaces from the PTF container in the "unbind" procedure.
Added a vm_topology "unbind" step in the "testbed-cli.sh restart-ptf" procedure.
Updated some "ip link" commands to fully compliant with the syntax in "ip link help".

How did you verify/test it?
Tested the add-topo/remove-topo on both physical and KVM testbed.
Tested restart-ptf on phsycial testbed.

* Avoid running command in exited ptf docker container (#10286)

While stopping PTF container, "ptf_control" module is executed to
kill all processes in the PTF container.
The original code checks if the PTF container's Pid exists before
running command in the PTF container. Unfortunately, this check
is not enough. PTF docker container in exited status still has Pid.

This change improved the code for getting PTF container's Pid.
When PTF container is not in "running" status, always return None
for PTF container's Pid.

Signed-off-by: Xin Wang <xiwang5@microsoft.com>

* Install right version of docker.py for ubuntu22.04

---------

Signed-off-by: Xin Wang <xiwang5@microsoft.com>
  • Loading branch information
wangxin authored Nov 9, 2023
1 parent 70ad8b3 commit 4473f6d
Show file tree
Hide file tree
Showing 6 changed files with 250 additions and 134 deletions.
141 changes: 141 additions & 0 deletions ansible/roles/vm_set/library/ptf_control.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/python

import json
import logging
import traceback

import docker

from ansible.module_utils.debug_utils import config_module_logging
from ansible.module_utils.basic import AnsibleModule

DOCUMENTATION = '''
---
module: ptf_control
version_added: "0.1"
author: Xin Wang (xiwang5@microsoft.com)
short_description: Control PTF container
description: For controlling PTF container, for example killing processes running in PTF container before stopping it.
Parameters:
- ctn_name: Name of the PTF container
- command: Command to run, currently only support "kill"
'''

EXAMPLES = '''
- name: Kill exabgp and ptf_nn_agent processes in PTF container
ptf_control:
ctn_name: "ptf_vms6-1"
command: kill
'''


class PtfControl(object):
"""This class is for controlling PTF container
"""

def __init__(self, module, ctn_name):
self.module = module
self.ctn_name = ctn_name

self.pid = PtfControl.get_pid(self.ctn_name)

def cmd(self, cmdline, use_unsafe_shell=False, ignore_failure=False, verbose=True):
rc, out, err = self.module.run_command(cmdline, use_unsafe_shell=use_unsafe_shell)
if verbose:
msg = {
'cmd': cmdline,
'rc': rc,
'stdout_lines': out.splitlines(),
'stderr_lines': err.splitlines()
}
logging.debug('***** RUN CMD:\n%s' % json.dumps(msg, indent=2))

if rc != 0 and not ignore_failure:
raise Exception("Failed to run command: %s, rc=%d, out=%s, err=%s" % (cmdline, rc, out, err))
return rc, out, err

@staticmethod
def get_pid(ctn_name):
cli = docker.from_env()
try:
ctn = cli.containers.get(ctn_name)
if ctn.status == 'running':
return ctn.attrs['State']['Pid']
except Exception as e:
logging.debug("Failed to get pid for container %s: %s" % (ctn_name, str(e)))

return None

def get_process_pids(self, process):
cmd = 'docker exec -t {} bash -c "pgrep -f \'{}\'"'.format(self.ctn_name, process)
_, out, _ = self.cmd(cmd, ignore_failure=True)
return [int(pid.strip()) for pid in out.splitlines()]

def get_supervisord_processes(self):
_, out, _ = self.cmd(
'docker exec -t {} bash -c "supervisorctl status"'.format(self.ctn_name), ignore_failure=True
)
processes = [line.strip().split()[0] for line in out.splitlines() if "sshd" not in line]
return processes

def kill_process(self, pid):
self.cmd('docker exec -t {} bash -c "kill -9 {}"'.format(self.ctn_name, pid), ignore_failure=True)

def kill_processes(self):
supervisord_processes = self.get_supervisord_processes()
self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))
for i in range(3):
logging.info("=== Attempt %d ===" % (i + 1))
logging.info("=== Use supervisorctl to stop processes ===")
for process in supervisord_processes:
self.cmd(
'docker exec -t {} bash -c "supervisorctl stop {}"'.format(self.ctn_name, process),
ignore_failure=True
)
self.cmd(
'docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name)
)

for pattern in [
"/usr/share/exabgp/http_api.py",
"/usr/local/bin/exabgp",
"ptf_nn_agent.py"
]:
logging.info("=== Kill process %s ===" % pattern)
for pid in self.get_process_pids(pattern):
self.kill_process(pid)

self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))


def main():
module = AnsibleModule(
argument_spec=dict(
ctn_name=dict(required=True, type='str'),
command=dict(required=True, type='str')
),
supports_check_mode=False)

ctn_name = module.params['ctn_name']
command = module.params['command']
if command not in ['kill']:
module.fail_json(msg="command %s is not supported" % command)

config_module_logging('ptf_control_' + ctn_name)

try:
ptf = PtfControl(module, ctn_name)
if command == "kill":
if ptf.pid is not None:
ptf.kill_processes()
except Exception as error:
logging.error(traceback.format_exc())
module.fail_json(msg=str(error))

module.exit_json(changed=True)


if __name__ == "__main__":
main()
90 changes: 71 additions & 19 deletions ansible/roles/vm_set/library/vm_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ def add_br_if_to_docker(self, bridge, ext_if, int_if):

VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, tmp_int_if))
if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s " % (tmp_int_if, self.pid))
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, tmp_int_if, int_if))

VMTopology.iface_up(int_if, self.pid)
Expand All @@ -443,11 +443,11 @@ def add_ip_to_docker_if(self, int_if, mgmt_ip_addr, mgmt_ipv6_addr=None, mgmt_gw
def add_dut_if_to_docker(self, iface_name, dut_iface):
logging.info("=== Add DUT interface %s to PTF docker as %s ===" % (dut_iface, iface_name))
if VMTopology.intf_exists(dut_iface) \
and VMTopology.intf_not_exists(dut_iface, self.pid) \
and VMTopology.intf_not_exists(iface_name, self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, dut_iface))
and VMTopology.intf_not_exists(dut_iface, pid=self.pid) \
and VMTopology.intf_not_exists(iface_name, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" % (dut_iface, self.pid))

if VMTopology.intf_exists(dut_iface, self.pid) and VMTopology.intf_not_exists(iface_name, self.pid):
if VMTopology.intf_exists(dut_iface, pid=self.pid) and VMTopology.intf_not_exists(iface_name, pid=self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, dut_iface, iface_name))

VMTopology.iface_up(iface_name, self.pid)
Expand All @@ -461,7 +461,7 @@ def add_dut_vlan_subif_to_docker(self, iface_name, vlan_separator, vlan_id):
VMTopology.cmd("nsenter -t %s -n ip link set %s up" % (self.pid, vlan_sub_iface_name))

def remove_dut_if_from_docker(self, iface_name, dut_iface):

logging.info("=== Restore docker interface %s as dut interface %s ===" % (iface_name, dut_iface))
if self.pid is None:
return

Expand All @@ -471,17 +471,20 @@ def remove_dut_if_from_docker(self, iface_name, dut_iface):
if VMTopology.intf_not_exists(dut_iface, self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, iface_name, dut_iface))

if VMTopology.intf_not_exists(dut_iface) and VMTopology.intf_exists(dut_iface, self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set netns 1 dev %s" % (self.pid, dut_iface))
if VMTopology.intf_not_exists(dut_iface) and VMTopology.intf_exists(dut_iface, pid=self.pid):
VMTopology.cmd(
"nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, dut_iface))

def remove_dut_vlan_subif_from_docker(self, iface_name, vlan_separator, vlan_id):
"""Remove the vlan sub interface created for the ptf interface."""
if self.pid is None:
return

vlan_sub_iface_name = iface_name + vlan_separator + vlan_id
if VMTopology.intf_exists(vlan_sub_iface_name, self.pid):
VMTopology.cmd("nsenter -t %s -n ip link del %s" % (self.pid, vlan_sub_iface_name))
if VMTopology.intf_exists(vlan_sub_iface_name, pid=self.pid):
VMTopology.iface_down(vlan_sub_iface_name, pid=self.pid)
VMTopology.cmd("nsenter -t %s -n ip link del %s" %
(self.pid, vlan_sub_iface_name))

def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwargs):
"""Create vethernet devices (ext_if, int_if) and put int_if into the ptf docker."""
Expand Down Expand Up @@ -526,14 +529,16 @@ def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwa
VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(t_int_if) \
and VMTopology.intf_not_exists(t_int_if, self.pid) \
and VMTopology.intf_not_exists(int_if, self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, t_int_if))
and VMTopology.intf_not_exists(t_int_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_if, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" %
(t_int_if, self.pid))
if create_vlan_subintf \
and VMTopology.intf_exists(t_int_sub_if) \
and VMTopology.intf_not_exists(t_int_sub_if, self.pid) \
and VMTopology.intf_not_exists(int_sub_if, self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, t_int_sub_if))
and VMTopology.intf_exists(t_int_sub_if) \
and VMTopology.intf_not_exists(t_int_sub_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_sub_if, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" %
(t_int_sub_if, self.pid))

if VMTopology.intf_exists(t_int_if, self.pid) and VMTopology.intf_not_exists(int_if, self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, t_int_if, int_if))
Expand All @@ -544,7 +549,7 @@ def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwa

VMTopology.iface_up(int_if, self.pid)
if create_vlan_subintf:
VMTopology.iface_up(int_sub_if, self.pid)
VMTopology.iface_up(int_sub_if, pid=self.pid)

def bind_mgmt_port(self, br_name, mgmt_port):
logging.info('=== Bind mgmt port %s to bridge %s ===' % (mgmt_port, br_name))
Expand Down Expand Up @@ -589,6 +594,7 @@ def bind_fp_ports(self, disconnect_vm=False):
self.bind_vs_dut_ports(VS_CHASSIS_MIDPLANE_BRIDGE_NAME, self.topo['DUT']['vs_chassis']['midplane_port'])

def unbind_fp_ports(self):
logging.info("=== unbind front panel ports ===")
for attr in self.VMs.values():
for vlan_num, vlan in enumerate(attr['vlans']):
br_name = adaptive_name(OVS_FP_BRIDGE_TEMPLATE, self.vm_names[self.vm_base_index + attr['vm_offset']], vlan_num)
Expand Down Expand Up @@ -847,6 +853,7 @@ def remove_host_ports(self):
"""
remove dut port from the ptf docker
"""
logging.info("=== Remove host ports ===")
for i, intf in enumerate(self.host_interfaces):
if self._is_multi_duts:
if isinstance(intf, list):
Expand All @@ -867,6 +874,45 @@ def remove_host_ports(self):
vlan_id = self.vlan_ids[str(intf)]
self.remove_dut_vlan_subif_from_docker(ptf_if, vlan_separator, vlan_id)

def remove_veth_if_from_docker(self, ext_if, int_if, tmp_name):
"""
Remove veth interface from docker
"""
logging.info("=== Cleanup port, int_if: %s, ext_if: %s, tmp_name: %s ===" % (ext_if, int_if, tmp_name))
if VMTopology.intf_exists(int_if, pid=self.pid):
# Name it back to temp name in PTF container to avoid potential conflicts
VMTopology.iface_down(int_if, pid=self.pid)
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, int_if, tmp_name))
# Set it to default namespace
VMTopology.cmd("nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, tmp_name))

# Delete its peer in default namespace
if VMTopology.intf_exists(ext_if):
VMTopology.cmd("ip link delete dev %s" % ext_if)

def remove_ptf_mgmt_port(self):
ext_if = PTF_MGMT_IF_TEMPLATE % self.vm_set_name
tmp_name = MGMT_PORT_NAME + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(MGMT_PORT_NAME))
self.remove_veth_if_from_docker(ext_if, MGMT_PORT_NAME, tmp_name)

def remove_ptf_backplane_port(self):
ext_if = PTF_BP_IF_TEMPLATE % self.vm_set_name
tmp_name = BP_PORT_NAME + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(BP_PORT_NAME))
self.remove_veth_if_from_docker(ext_if, BP_PORT_NAME, tmp_name)

def remove_injected_fp_ports_from_docker(self):
for vm, vlans in self.injected_fp_ports.items():
for vlan in vlans:
(_, _, ptf_index) = VMTopology.parse_vm_vlan_port(vlan)
ext_if = adaptive_name(INJECTED_INTERFACES_TEMPLATE, self.vm_set_name, ptf_index)
int_if = PTF_FP_IFACE_TEMPLATE % ptf_index
properties = self.vm_properties.get(vm, {})
create_vlan_subintf = properties.get('device_type') in (
BACKEND_TOR_TYPE, BACKEND_LEAF_TYPE)
if not create_vlan_subintf:
tmp_name = int_if + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(int_if))
self.remove_veth_if_from_docker(ext_if, int_if, tmp_name)

@staticmethod
def _generate_fingerprint(name, digit=6):
"""
Expand Down Expand Up @@ -1363,9 +1409,14 @@ def main():
if vm_type != "vsonic":
net.unbind_vm_backplane()
net.unbind_fp_ports()
net.remove_injected_fp_ports_from_docker()

if hostif_exists:
net.remove_host_ports()

net.remove_ptf_mgmt_port()
net.remove_ptf_backplane_port()

elif cmd == 'renumber':
check_params(module, ['vm_set_name',
'topo',
Expand Down Expand Up @@ -1412,6 +1463,7 @@ def main():
net.unbind_fp_ports()
net.add_injected_fp_ports_to_docker()
net.bind_fp_ports()
net.bind_vm_backplane()
net.add_bp_port_to_docker(ptf_bp_ip_addr, ptf_bp_ipv6_addr)

if hostif_exists:
Expand Down
Loading

0 comments on commit 4473f6d

Please sign in to comment.