[auto-ts] add memory check (#2116)

- What I did: Implemented a memory threshold check in the auto-techsupport feature according to sonic-net/SONiC#939. - How I did it: Added two scripts, the check script and the handler script. A few modifications were made to the auto-techsupport implementation. Unit tests (UT) were added. - How to verify it: Run the action script and the handler script on the switch. Run the unit tests. Signed-off-by: Stepan Blyschak <stepanb@nvidia.com>
sonic-net · May 14, 2022 · aa81b97 · aa81b97
1 parent b370290
commit aa81b97
Show file tree

Hide file tree

Showing 14 changed files with 673 additions and 135 deletions.
diff --git a/config/plugins/auto_techsupport.py b/config/plugins/auto_techsupport.py
@@ -228,6 +228,50 @@ def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
         exit_with_error(f"Error: {err}", fg="red")
 
 
+@AUTO_TECHSUPPORT_GLOBAL.command(name="available-mem-threshold")
+@click.argument(
+    "available-mem-threshold",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_available_mem_threshold(db, available_mem_threshold):
+    """ Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.
+    """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "available_mem_threshold": available_mem_threshold,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="min-available-mem")
+@click.argument(
+    "min-available-mem",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_min_available_mem(db, min_available_mem):
+    """ Minimum free memory amount in Kb when techsupport will be executed.
+    """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "min_available_mem": min_available_mem,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
 @AUTO_TECHSUPPORT_GLOBAL.command(name="since")
 @click.argument(
     "since",
@@ -271,8 +315,12 @@ def AUTO_TECHSUPPORT_FEATURE():
     "--rate-limit-interval",
     help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
 )
+@click.option(
+    "--available-mem-threshold",
+    help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
+)
 @clicommon.pass_db
-def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
+def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval, available_mem_threshold):
     """ Add object in AUTO_TECHSUPPORT_FEATURE. """
 
     table = "AUTO_TECHSUPPORT_FEATURE"
@@ -282,6 +330,8 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
         data["state"] = state
     if rate_limit_interval is not None:
         data["rate_limit_interval"] = rate_limit_interval
+    if available_mem_threshold is not None:
+        data["available_mem_threshold"] = available_mem_threshold
 
     try:
         add_entry_validated(db.cfgdb, table, key, data)
@@ -303,8 +353,12 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
     "--rate-limit-interval",
     help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
 )
+@click.option(
+    "--available-mem-threshold",
+    help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
+)
 @clicommon.pass_db
-def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
+def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval, available_mem_threshold):
     """ Add object in AUTO_TECHSUPPORT_FEATURE. """
 
     table = "AUTO_TECHSUPPORT_FEATURE"
@@ -314,6 +368,8 @@ def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval
         data["state"] = state
     if rate_limit_interval is not None:
         data["rate_limit_interval"] = rate_limit_interval
+    if available_mem_threshold is not None:
+        data["available_mem_threshold"] = available_mem_threshold
 
     try:
         update_entry_validated(db.cfgdb, table, key, data)

diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py
@@ -5,18 +5,11 @@
     For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
 """
 import os
-import time
 import argparse
 import syslog
-import re
 from swsscommon.swsscommon import SonicV2Connector
 from utilities_common.auto_techsupport_helper import *
 
-# Explicity Pass this to the subprocess invoking techsupport
-ENV_VAR = os.environ
-PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else ""
-ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV
-
 
 def handle_coredump_cleanup(dump_name, db):
     _, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN))
@@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
         self.core_name = core_name
         self.container = container_name
         self.db = db
-        self.proc_mp = {}
-        self.core_ts_map = {}
 
     def handle_core_dump_creation_event(self):
         if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
@@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
             syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name))
             return
 
-        global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF)  
-        container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF)
-
-        try:
-            global_cooloff = float(global_cooloff)
-        except ValueError:
-            global_cooloff = 0.0
-
-        try:
-            container_cooloff = float(container_cooloff)
-        except ValueError:
-            container_cooloff = 0.0
-
-        cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff)
-        if cooloff_passed:
-            since_cfg = self.get_since_arg()
-            new_file = self.invoke_ts_cmd(since_cfg)
-            if new_file:
-                self.write_to_state_db(int(time.time()), new_file)
-
-    def write_to_state_db(self, timestamp, ts_dump):
-        name = strip_ts_ext(ts_dump)
-        key = TS_MAP + "|" + name
-        self.db.set(STATE_DB, key, CORE_DUMP, self.core_name)
-        self.db.set(STATE_DB, key, TIMESTAMP, str(timestamp))
-        self.db.set(STATE_DB, key, CONTAINER, self.container)
-
-    def get_since_arg(self):
-        since_cfg = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE)
-        if not since_cfg:
-            return SINCE_DEFAULT
-        rc, _, stderr = subprocess_exec(["date", "--date={}".format(since_cfg)], env=ENV_VAR)
-        if rc == 0:
-            return since_cfg
-        return SINCE_DEFAULT
-
-    def parse_ts_dump_name(self, ts_stdout):
-        """ Figure out the ts_dump name from the techsupport stdout """
-        matches = re.findall(TS_PTRN, ts_stdout)
-        if matches:
-            return matches[-1]
-        syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
-        return ""
-
-    def invoke_ts_cmd(self, since_cfg, num_retry=0):
-        cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
-        cmd  = " ".join(cmd_opts)
-        rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
-        new_dump = ""
-        if rc == EXT_LOCKFAIL:
-            syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
-        elif rc == EXT_RETRY:
-            if num_retry <= MAX_RETRY_LIMIT:
-                return self.invoke_ts_cmd(since_cfg, num_retry+1)
-            else:
-                syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
-        elif rc != EXT_SUCCESS:
-            syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
-        else: # EXT_SUCCESS
-            new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name
-            if not new_dump:
-                syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
-            else:
-                syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
-        return new_dump
-
-    def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
-        """Verify both the global and per-proc rate_limit_intervals have passed"""
-        curr_ts_list = get_ts_dumps(True)
-        if global_cooloff and curr_ts_list:
-            last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
-            if time.time() - last_ts_dump_creation < global_cooloff:
-                msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
-                syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
-                return False
-
-        self.parse_ts_map()
-        if container_cooloff and self.container in self.core_ts_map:
-            last_creation_time = self.core_ts_map[self.container][0][0]
-            if time.time() - last_creation_time < container_cooloff:
-                msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
-                syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name))
-                return False
-        return True
-
-    def parse_ts_map(self):
-        """Create proc_name, ts_dump & creation_time map"""
-        ts_keys = self.db.keys(STATE_DB, TS_MAP+"*")
-        if not ts_keys:
-            return
-        for ts_key in ts_keys:
-            data = self.db.get_all(STATE_DB, ts_key)
-            if not data:
-                continue
-            container_name = data.get(CONTAINER, "")
-            creation_time = data.get(TIMESTAMP, "")
-            try:
-                creation_time = int(creation_time)
-            except Exception:
-                continue  # if the creation time is invalid, skip the entry
-            ts_dump = ts_key.split("|")[-1]
-            if container_name and container_name not in self.core_ts_map:
-                self.core_ts_map[container_name] = []
-            self.core_ts_map[container_name].append((int(creation_time), ts_dump))
-        for container_name in self.core_ts_map:
-            self.core_ts_map[container_name].sort()
+        invoke_ts_command_rate_limited(self.db, EVENT_TYPE_CORE, {CORE_DUMP: self.core_name}, self.container)
+
 
 def main():
     parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script')