Skip to content

Commit

Permalink
Merge branch 'master' into feature/caclmgrd_external_client
Browse files Browse the repository at this point in the history
  • Loading branch information
prsunny authored Aug 25, 2022
2 parents bd7b172 + f9af7ae commit d992dc0
Show file tree
Hide file tree
Showing 5 changed files with 395 additions and 31 deletions.
68 changes: 51 additions & 17 deletions scripts/determine-reboot-cause
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,12 @@ def find_hardware_reboot_cause():
else:
sonic_logger.log_info("No reboot cause found from platform api")

hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
hardware_reboot_cause_minor_str = ""
if hardware_reboot_cause_minor:
hardware_reboot_cause_minor_str = " ({})".format(hardware_reboot_cause_minor)

hardware_reboot_cause = hardware_reboot_cause_major + hardware_reboot_cause_minor_str

return hardware_reboot_cause


Expand Down Expand Up @@ -158,6 +163,50 @@ def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):

return reboot_cause_dict

def determine_reboot_cause():
    """Decide the previous reboot cause and any additional reboot info.

    Three candidate causes are gathered, in this order:
      1. the /proc/cmdline cause (find_proc_cmdline_reboot_cause),
      2. the hardware cause from the platform API (find_hardware_reboot_cause),
      3. the software cause from the reboot-cause file (find_software_reboot_cause).

    Decision logic:
      * If the hardware cause does not contain REBOOT_CAUSE_NON_HARDWARE, it is
        the reboot cause. A known software cause (or, failing that, a cmdline
        cause) is reported as the additional reboot info.
      * Otherwise, if there is a cmdline cause, the software cause is used when
        it is known, else the cmdline cause is used.
      * Otherwise the software cause is used as-is.

    Returns:
        tuple: (previous_reboot_cause, additional_reboot_info). The additional
        info stays "N/A" unless the hardware branch supplies one.
    """
    # Stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE; "N/A" when nothing
    # beyond the primary cause is available.
    additional_reboot_info = "N/A"

    # 1. Check if the previous reboot was warm/fast reboot by testing whether
    #    there is "fast|fastfast|warm" in /proc/cmdline
    proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()

    # 2. Check if the previous reboot was caused by hardware
    hardware_reboot_cause = find_hardware_reboot_cause()

    # 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
    #    reboot info.
    software_reboot_cause = find_software_reboot_cause()

    # Compare by value: an identity check (`is not`) would treat an equal but
    # distinct string object as a known cause.
    software_cause_known = software_reboot_cause != REBOOT_CAUSE_UNKNOWN

    if REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
        previous_reboot_cause = hardware_reboot_cause
        # Check if any software reboot was issued before this hardware reboot
        # happened, and record it as the additional reboot cause.
        if software_cause_known:
            additional_reboot_info = software_reboot_cause
        elif proc_cmdline_reboot_cause is not None:
            additional_reboot_info = proc_cmdline_reboot_cause
    elif proc_cmdline_reboot_cause is not None:
        if software_cause_known:
            # Get the reboot cause from REBOOT_CAUSE_FILE
            previous_reboot_cause = software_reboot_cause
        else:
            previous_reboot_cause = proc_cmdline_reboot_cause
    else:
        previous_reboot_cause = software_reboot_cause

    return previous_reboot_cause, additional_reboot_info


def main():
# Configure logger to log all messages INFO level and higher
Expand All @@ -177,22 +226,7 @@ def main():
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)

# This variable is kept for future-use purpose. When proc_cmd_line/vendor/software provides
# any additional_reboot_info it will be stored as a "comment" in REBOOT_CAUSE_HISTORY_FILE
additional_reboot_info = "N/A"

# Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()

# If /proc/cmdline does not indicate reboot cause, check if the previous reboot was caused by hardware
if proc_cmdline_reboot_cause is None:
previous_reboot_cause = find_hardware_reboot_cause()
if previous_reboot_cause.startswith(REBOOT_CAUSE_NON_HARDWARE):
# If the reboot cause is non-hardware, get the reboot cause from REBOOT_CAUSE_FILE
previous_reboot_cause = find_software_reboot_cause()
else:
# Get the reboot cause from REBOOT_CAUSE_FILE
previous_reboot_cause = find_software_reboot_cause()
previous_reboot_cause, additional_reboot_info = determine_reboot_cause()

# Current time
reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
Expand Down
173 changes: 166 additions & 7 deletions scripts/hostcfgd
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import re
import jinja2
from sonic_py_common import device_info
from swsscommon.swsscommon import ConfigDBConnector, DBConnector, Table
from swsscommon import swsscommon

# FILE
PAM_AUTH_CONF = "/etc/pam.d/common-auth-sonic"
Expand Down Expand Up @@ -1253,6 +1254,143 @@ class PamLimitsCfg(object):
"modify pam_limits config file failed with exception: {}"
.format(e))

class DeviceMetaCfg(object):
    """
    DeviceMetaCfg Config Daemon
    Handles changes in DEVICE_METADATA table.
    1) Handle hostname change
    """

    def __init__(self):
        # Last hostname seen by this handler; empty until load() or a
        # hostname update sets it.
        self.hostname = ''

    def load(self, dev_meta=None):
        """Cache the initial hostname.

        Args:
            dev_meta: Mapping that may hold a 'localhost' entry with a
                'hostname' field. None or a missing entry/field yields ''.
        """
        # A None default avoids sharing one mutable dict between calls.
        if dev_meta is None:
            dev_meta = {}

        # Get hostname initial
        self.hostname = dev_meta.get('localhost', {}).get('hostname', '')
        syslog.syslog(syslog.LOG_DEBUG, f'Initial hostname: {self.hostname}')

    def hostname_update(self, data):
        """
        Apply hostname handler.

        Args:
            data: Read table's key's data.
        """
        syslog.syslog(syslog.LOG_DEBUG, 'DeviceMetaCfg: hostname update')
        new_hostname = data.get('hostname')

        # Restart hostname-config service when hostname was changed.
        # Empty not allowed
        if new_hostname and new_hostname != self.hostname:
            syslog.syslog(syslog.LOG_INFO, 'DeviceMetaCfg: Set new hostname: {}'
                                           .format(new_hostname))
            # The cache is updated before the restart is attempted, so it
            # keeps the new name even when the restart below fails.
            self.hostname = new_hostname
            try:
                # NOTE(review): the two positional flags presumably enable
                # error logging and raising on failure -- confirm against
                # run_cmd's definition.
                run_cmd('sudo service hostname-config restart', True, True)
            except subprocess.CalledProcessError as e:
                syslog.syslog(syslog.LOG_ERR, 'DeviceMetaCfg: Failed to set new'
                                              ' hostname: {}'.format(e))
                return

            run_cmd('sudo monit reload')
        else:
            msg = 'Hostname was not updated: '
            msg += 'Already set up' if new_hostname else 'Empty not allowed'
            syslog.syslog(syslog.LOG_ERR, msg)


class MgmtIfaceCfg(object):
    """
    MgmtIfaceCfg Config Daemon
    Handles changes in MGMT_INTERFACE, MGMT_VRF_CONFIG tables.
    1) Handle change of interface ip
    2) Handle change of management VRF state
    """

    def __init__(self):
        # Cache of the last applied management interface config, by key.
        self.iface_config_data = {}
        # Cache of the last applied 'mgmtVrfEnabled' value ('' when unset).
        self.mgmt_vrf_enabled = ''

    def load(self, mgmt_iface=None, mgmt_vrf=None):
        """Cache the initial management interface and VRF configuration.

        Args:
            mgmt_iface: Mapping of interface keys to their config data.
            mgmt_vrf: Mapping read for its 'mgmtVrfEnabled' value.
        """
        # Keep a private copy: update_mgmt_iface() writes into this cache, and
        # it must not leak into the caller's dict or a shared default.
        self.iface_config_data = {} if mgmt_iface is None else dict(mgmt_iface)

        # NOTE(review): 'mgmtVrfEnabled' is looked up at the top level of
        # mgmt_vrf -- confirm the caller passes the entry's fields rather than
        # a whole table keyed by entry name.
        if mgmt_vrf is None:
            mgmt_vrf = {}
        self.mgmt_vrf_enabled = mgmt_vrf.get('mgmtVrfEnabled', '')

        syslog.syslog(syslog.LOG_DEBUG,
                      f'Initial mgmt interface conf: {self.iface_config_data}')
        syslog.syslog(syslog.LOG_DEBUG,
                      f'Initial mgmt VRF state: {self.mgmt_vrf_enabled}')

    def update_mgmt_iface(self, iface, key, data):
        """Handle update management interface config

        Args:
            iface: Interface name, used only in the log message.
            key: Cache key of the changed entry.
            data: New config data for that entry.
        """
        syslog.syslog(syslog.LOG_DEBUG, 'MgmtIfaceCfg: mgmt iface update')

        # Nothing to do when the config matches what is already cached.
        if data == self.iface_config_data.get(key):
            return

        # Restart management interface service when config was changed
        cfg = {key: data}
        syslog.syslog(syslog.LOG_INFO, f'MgmtIfaceCfg: Set new interface '
                                       f'config {cfg} for {iface}')
        try:
            run_cmd('sudo systemctl restart interfaces-config', True, True)
            run_cmd('sudo systemctl restart ntp-config', True, True)
        except subprocess.CalledProcessError:
            syslog.syslog(syslog.LOG_ERR, 'Failed to restart management '
                                          'interface services')
            return

        # Only remember the new config once the restarts succeeded.
        self.iface_config_data[key] = data

    def update_mgmt_vrf(self, data):
        """Handle update management VRF state

        Args:
            data: Mapping read for its 'mgmtVrfEnabled' value.
        """
        syslog.syslog(syslog.LOG_DEBUG, 'MgmtIfaceCfg: mgmt vrf state update')

        # Restart mgmt vrf services when mgmt vrf config was changed.
        # Empty not allowed.
        enabled = data.get('mgmtVrfEnabled', '')
        if not enabled or enabled == self.mgmt_vrf_enabled:
            return

        syslog.syslog(syslog.LOG_INFO, f'Set mgmt vrf state {enabled}')

        # Restart related vrfs services
        try:
            run_cmd('service ntp stop', True, True)
            run_cmd('systemctl restart interfaces-config', True, True)
            run_cmd('service ntp start', True, True)
        except subprocess.CalledProcessError:
            syslog.syslog(syslog.LOG_ERR, 'Failed to restart management vrf '
                                          'services')
            return

        # Remove mgmt if route
        if enabled == 'true':
            # The regular expression for grep in below cmd is to match eth0
            # line in /proc/net/route, sample file:
            #   $ cat /proc/net/route
            #   Iface Destination Gateway  Flags RefCnt Use
            #   eth0  00000000    01803B0A 0003  0      0
            #   #################### Line break here ####################
            #   Metric Mask     MTU Window IRTT
            #   202    00000000 0   0      0
            #
            # NOTE(review): inside these raw strings `\"` reaches the shell as
            # an escaped literal quote, so the pattern is not actually quoted,
            # and the pipeline ends in `wc -l` -- confirm the command matches
            # and fails as intended.
            try:
                run_cmd(r"""cat /proc/net/route | grep -E \"eth0\s+"""
                        r"""00000000\s+[0-9A-Z]+\s+[0-9]+\s+[0-9]+\s+[0-9]+"""
                        r"""\s+202\" | wc -l""",
                        True, True)
            except subprocess.CalledProcessError:
                syslog.syslog(syslog.LOG_ERR, 'MgmtIfaceCfg: Could not delete '
                                              'eth0 route')
                return

            run_cmd("ip -4 route del default dev eth0 metric 202", False)

        # Update cache
        self.mgmt_vrf_enabled = enabled


class HostConfigDaemon:
def __init__(self):
# Just a sanity check to verify if the CONFIG_DB has been initialized
Expand Down Expand Up @@ -1284,7 +1422,6 @@ class HostConfigDaemon:
self.is_multi_npu = device_info.is_multi_npu()

# Initialize AAACfg
self.hostname_cache=""
self.aaacfg = AaaCfg()

# Initialize PasswHardening
Expand All @@ -1294,6 +1431,12 @@ class HostConfigDaemon:
self.pamLimitsCfg = PamLimitsCfg(self.config_db)
self.pamLimitsCfg.update_config_file()

# Initialize DeviceMetaCfg
self.devmetacfg = DeviceMetaCfg()

# Initialize MgmtIfaceCfg
self.mgmtifacecfg = MgmtIfaceCfg()

def load(self, init_data):
features = init_data['FEATURE']
aaa = init_data['AAA']
Expand All @@ -1306,21 +1449,21 @@ class HostConfigDaemon:
ntp_global = init_data['NTP']
kdump = init_data['KDUMP']
passwh = init_data['PASSW_HARDENING']
dev_meta = init_data.get(swsscommon.CFG_DEVICE_METADATA_TABLE_NAME, {})
mgmt_ifc = init_data.get(swsscommon.CFG_MGMT_INTERFACE_TABLE_NAME, {})
mgmt_vrf = init_data.get(swsscommon.CFG_MGMT_VRF_CONFIG_TABLE_NAME, {})

self.feature_handler.sync_state_field(features)
self.aaacfg.load(aaa, tacacs_global, tacacs_server, radius_global, radius_server)
self.iptables.load(lpbk_table)
self.ntpcfg.load(ntp_global, ntp_server)
self.kdumpCfg.load(kdump)
self.passwcfg.load(passwh)

dev_meta = self.config_db.get_table('DEVICE_METADATA')
if 'localhost' in dev_meta:
if 'hostname' in dev_meta['localhost']:
self.hostname_cache = dev_meta['localhost']['hostname']
self.devmetacfg.load(dev_meta)
self.mgmtifacecfg.load(mgmt_ifc, mgmt_vrf)

# Update AAA with the hostname
self.aaacfg.hostname_update(self.hostname_cache)
self.aaacfg.hostname_update(self.devmetacfg.hostname)

def __get_intf_name(self, key):
if isinstance(key, tuple) and key:
Expand Down Expand Up @@ -1370,6 +1513,10 @@ class HostConfigDaemon:
mgmt_intf_name = self.__get_intf_name(key)
self.aaacfg.handle_radius_source_intf_ip_chg(mgmt_intf_name)
self.aaacfg.handle_radius_nas_ip_chg(mgmt_intf_name)
self.mgmtifacecfg.update_mgmt_iface(mgmt_intf_name, key, data)

def mgmt_vrf_handler(self, key, op, data):
    """Forward a management VRF config change to MgmtIfaceCfg.

    Only ``data`` is passed on; ``key`` and ``op`` are not used here.
    """
    self.mgmtifacecfg.update_mgmt_vrf(data)

def lpbk_handler(self, key, op, data):
key = ConfigDBConnector.deserialize_key(key)
Expand Down Expand Up @@ -1409,6 +1556,10 @@ class HostConfigDaemon:
syslog.syslog(syslog.LOG_INFO, 'Kdump handler...')
self.kdumpCfg.kdump_update(key, data)

def device_metadata_handler(self, key, op, data):
    """Forward a DEVICE_METADATA change to DeviceMetaCfg's hostname handler.

    Only ``data`` is passed on; ``key`` and ``op`` are not inspected.
    """
    # NOTE(review): every key of the table reaches hostname_update() --
    # confirm whether entries other than 'localhost' should be ignored.
    syslog.syslog(syslog.LOG_INFO, 'DeviceMeta handler...')
    self.devmetacfg.hostname_update(data)

def wait_till_system_init_done(self):
# No need to print the output in the log file so using the "--quiet"
# flag
Expand Down Expand Up @@ -1448,6 +1599,14 @@ class HostConfigDaemon:
self.config_db.subscribe('PORTCHANNEL_INTERFACE', make_callback(self.portchannel_intf_handler))
self.config_db.subscribe('INTERFACE', make_callback(self.phy_intf_handler))

# Handle DEVICE_METADATA changes
self.config_db.subscribe(swsscommon.CFG_DEVICE_METADATA_TABLE_NAME,
make_callback(self.device_metadata_handler))

# Handle MGMT_VRF_CONFIG changes
self.config_db.subscribe(swsscommon.CFG_MGMT_VRF_CONFIG_TABLE_NAME,
make_callback(self.mgmt_vrf_handler))

syslog.syslog(syslog.LOG_INFO,
"Waiting for systemctl to finish initialization")
self.wait_till_system_init_done()
Expand Down
Loading

0 comments on commit d992dc0

Please sign in to comment.