Skip to content

Commit

Permalink
[auto-ts] add memory check (#2116)
Browse files Browse the repository at this point in the history
- What I did
Implemented memory threshold check in auto techsupport feature according to sonic-net/SONiC#939.

- How I did it
Added two scripts: the check script and the handler script. Made a few modifications to the auto techsupport implementation. Added unit tests (UT).

- How to verify it
Run the action script and the handler script on the switch. Run UT.

Signed-off-by: Stepan Blyschak <stepanb@nvidia.com>
  • Loading branch information
stepanblyschak committed May 14, 2022
1 parent b370290 commit aa81b97
Show file tree
Hide file tree
Showing 14 changed files with 673 additions and 135 deletions.
60 changes: 58 additions & 2 deletions config/plugins/auto_techsupport.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,50 @@ def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="available-mem-threshold")
@click.argument("available-mem-threshold", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_available_mem_threshold(db, available_mem_threshold):
    """ Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.
    """

    # Only the one field is written to the AUTO_TECHSUPPORT|GLOBAL entry; the
    # entry is created when it does not exist yet (create_if_not_exists=True).
    patch = {"available_mem_threshold": available_mem_threshold}
    try:
        update_entry_validated(
            db.cfgdb, "AUTO_TECHSUPPORT", "GLOBAL", patch, create_if_not_exists=True
        )
    except Exception as err:
        # Any failure from the validated update is reported through exit_with_error.
        exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="min-available-mem")
@click.argument("min-available-mem", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_min_available_mem(db, min_available_mem):
    """ Minimum free memory amount in Kb when techsupport will be executed.
    """

    # Only the one field is written to the AUTO_TECHSUPPORT|GLOBAL entry; the
    # entry is created when it does not exist yet (create_if_not_exists=True).
    patch = {"min_available_mem": min_available_mem}
    try:
        update_entry_validated(
            db.cfgdb, "AUTO_TECHSUPPORT", "GLOBAL", patch, create_if_not_exists=True
        )
    except Exception as err:
        # Any failure from the validated update is reported through exit_with_error.
        exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
@click.argument(
"since",
Expand Down Expand Up @@ -271,8 +315,12 @@ def AUTO_TECHSUPPORT_FEATURE():
"--rate-limit-interval",
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@click.option(
"--available-mem-threshold",
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval, available_mem_threshold):
""" Add object in AUTO_TECHSUPPORT_FEATURE. """

table = "AUTO_TECHSUPPORT_FEATURE"
Expand All @@ -282,6 +330,8 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
data["state"] = state
if rate_limit_interval is not None:
data["rate_limit_interval"] = rate_limit_interval
if available_mem_threshold is not None:
data["available_mem_threshold"] = available_mem_threshold

try:
add_entry_validated(db.cfgdb, table, key, data)
Expand All @@ -303,8 +353,12 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
"--rate-limit-interval",
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@click.option(
"--available-mem-threshold",
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval, available_mem_threshold):
""" Add object in AUTO_TECHSUPPORT_FEATURE. """

table = "AUTO_TECHSUPPORT_FEATURE"
Expand All @@ -314,6 +368,8 @@ def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval
data["state"] = state
if rate_limit_interval is not None:
data["rate_limit_interval"] = rate_limit_interval
if available_mem_threshold is not None:
data["available_mem_threshold"] = available_mem_threshold

try:
update_entry_validated(db.cfgdb, table, key, data)
Expand Down
117 changes: 2 additions & 115 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,11 @@
For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
"""
import os
import time
import argparse
import syslog
import re
from swsscommon.swsscommon import SonicV2Connector
from utilities_common.auto_techsupport_helper import *

# Explicitly pass this to the subprocess invoking techsupport
ENV_VAR = os.environ
PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else ""
ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV


def handle_coredump_cleanup(dump_name, db):
_, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN))
Expand Down Expand Up @@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
self.core_name = core_name
self.container = container_name
self.db = db
self.proc_mp = {}
self.core_ts_map = {}

def handle_core_dump_creation_event(self):
if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
Expand All @@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name))
return

global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF)
container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF)

try:
global_cooloff = float(global_cooloff)
except ValueError:
global_cooloff = 0.0

try:
container_cooloff = float(container_cooloff)
except ValueError:
container_cooloff = 0.0

cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff)
if cooloff_passed:
since_cfg = self.get_since_arg()
new_file = self.invoke_ts_cmd(since_cfg)
if new_file:
self.write_to_state_db(int(time.time()), new_file)

def write_to_state_db(self, timestamp, ts_dump):
    """Record in STATE_DB which core dump and container produced ``ts_dump``.

    The entry key is ``TS_MAP|<ts_dump with its extension stripped>``; the
    core name, the stringified timestamp and the container name are stored
    as separate fields, in that order.
    """
    entry_key = "|".join((TS_MAP, strip_ts_ext(ts_dump)))
    fields = (
        (CORE_DUMP, self.core_name),
        (TIMESTAMP, str(timestamp)),
        (CONTAINER, self.container),
    )
    for field, value in fields:
        self.db.set(STATE_DB, entry_key, field, value)

def get_since_arg(self):
    """Return the configured ``since`` value if usable, else SINCE_DEFAULT.

    The configured string is accepted only when ``date --date=<value>``
    exits with status 0; an unset/empty value or a rejected one falls back
    to SINCE_DEFAULT.
    """
    configured = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE)
    if configured:
        # Let the `date` utility decide whether the string is parseable.
        exit_code, _, _ = subprocess_exec(["date", "--date={}".format(configured)], env=ENV_VAR)
        if exit_code == 0:
            return configured
    return SINCE_DEFAULT

def parse_ts_dump_name(self, ts_stdout):
    """Extract the techsupport dump name from the command's stdout.

    Returns the last TS_PTRN match found in ``ts_stdout``; when nothing
    matches, an error is written to syslog and "" is returned.
    """
    found = re.findall(TS_PTRN, ts_stdout)
    if not found:
        syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
        return ""
    return found[-1]

def invoke_ts_cmd(self, since_cfg, num_retry=0):
    """Run ``show techsupport --silent --since <since_cfg>`` and return the dump name.

    Returns the dump name parsed from stdout on success, or "" when the
    command hit the lock-failure exit code, exhausted its retries, failed
    with another exit code, or produced no recognizable dump name.
    ``num_retry`` counts the retries already performed for EXT_RETRY.
    """
    argv = ["show", "techsupport", "--silent", "--since", since_cfg]
    cmd_line = " ".join(argv)
    rc, stdout, stderr = subprocess_exec(argv, env=ENV_VAR)

    if rc == EXT_LOCKFAIL:
        syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
        return ""

    if rc == EXT_RETRY:
        # Re-invoke until the retry budget is used up.
        if num_retry <= MAX_RETRY_LIMIT:
            return self.invoke_ts_cmd(since_cfg, num_retry+1)
        syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
        return ""

    if rc != EXT_SUCCESS:
        syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
        return ""

    # EXT_SUCCESS: the dump name has to be recovered from the command output.
    dump_name = self.parse_ts_dump_name(stdout)
    if dump_name:
        syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd_line, dump_name))
    else:
        syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd_line))
    return dump_name

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
    """Verify both the global and per-proc rate_limit_intervals have passed.

    Args:
        global_cooloff: minimum number of seconds required since the last
            techsupport dump of any origin; a falsy value disables the check.
        container_cooloff: minimum number of seconds required since the last
            techsupport dump recorded for ``self.container``; a falsy value
            disables the check.

    Returns:
        True when techsupport may be invoked, False when either interval
        has not elapsed yet (an informational syslog message is emitted).
    """
    now = time.time()

    curr_ts_list = get_ts_dumps(True)
    if global_cooloff and curr_ts_list:
        # The last list entry is taken as the most recent dump
        # (presumably get_ts_dumps(True) returns them oldest-first -- confirm).
        last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
        if now - last_ts_dump_creation < global_cooloff:
            msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
            syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
            return False

    self.parse_ts_map()
    container_dumps = self.core_ts_map.get(self.container)
    if container_cooloff and container_dumps:
        # Compare against the NEWEST recorded dump for this container. The
        # previous code read element [0] of an ascending-sorted list, i.e. the
        # oldest dump, which let a fresh dump slip past the rate limit.
        last_creation_time = max(created for created, _ in container_dumps)
        if now - last_creation_time < container_cooloff:
            msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
            syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name))
            return False
    return True

def parse_ts_map(self):
    """Create proc_name, ts_dump & creation_time map.

    Reads every STATE_DB key matching ``TS_MAP*`` and fills
    ``self.core_ts_map`` as ``{container_name: [(creation_time, ts_dump), ...]}``
    with each list sorted ascending. Entries with a missing or non-integer
    timestamp, or without a container name, are skipped.
    """
    ts_keys = self.db.keys(STATE_DB, TS_MAP+"*")
    if not ts_keys:
        return

    for ts_key in ts_keys:
        data = self.db.get_all(STATE_DB, ts_key)
        if not data:
            continue

        container_name = data.get(CONTAINER, "")
        if not container_name:
            # Without a container name there is nothing to index the entry
            # by; previously this path raised KeyError on the append below.
            continue

        try:
            creation_time = int(data.get(TIMESTAMP, ""))
        except (TypeError, ValueError):
            continue  # if the creation time is invalid, skip the entry

        # The dump name is the last "|"-separated component of the key.
        ts_dump = ts_key.split("|")[-1]
        self.core_ts_map.setdefault(container_name, []).append((creation_time, ts_dump))

    for dumps in self.core_ts_map.values():
        dumps.sort()
invoke_ts_command_rate_limited(self.db, EVENT_TYPE_CORE, {CORE_DUMP: self.core_name}, self.container)


def main():
parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script')
Expand Down
Loading

0 comments on commit aa81b97

Please sign in to comment.