Skip to content

Commit

Permalink
[techsupport] Handle minor fixes of TS Lock and update auto-TS (#2114)
Browse files Browse the repository at this point in the history
1. Print the last statement as the techsupport dump name, as some automation processes might depend of parsing the last line to infer the dump path.

Previously:
  handle_exit
  Removing lock. Exit: 0
  removed '/tmp/techsupport-lock/PID'
  removed directory '/tmp/techsupport-lock'

Updated:
  handle_exit
  Removing lock. Exit: 0
  removed '/tmp/techsupport-lock/PID'
  removed directory '/tmp/techsupport-lock'
  /var/dump/sonic_dump_r-bulldog-03_20220324_195553.tar.gz

2. Don't acquire the lock when running in NOOP mode
3. Set the set -v option just before running main so that it won't print the generate_dump code to stdout
4. Update the auto-techsupport script to handle EXT_RETRY and EXT_LOCKFAIL exit codes returned by show techsupport command.
5. Update the minor error in since argument for auto-techsupport

Signed-off-by: Vivek Keddy Karri <vkarri@nvidia.com>
  • Loading branch information
vivekrnv authored and judyjoseph committed Apr 4, 2022
1 parent 3602f99 commit 7050581
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 16 deletions.
24 changes: 16 additions & 8 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,26 @@ def parse_ts_dump_name(self, ts_stdout):
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
return ""

def invoke_ts_cmd(self, since_cfg):
since_cfg = "'" + since_cfg + "'"
def invoke_ts_cmd(self, since_cfg, num_retry=0):
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
cmd = " ".join(cmd_opts)
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
if rc:
new_dump = ""
if rc == EXT_LOCKFAIL:
syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
elif rc == EXT_RETRY:
if num_retry <= MAX_RETRY_LIMIT:
return self.invoke_ts_cmd(since_cfg, num_retry+1)
else:
syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
elif rc != EXT_SUCCESS:
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
new_dump = self.parse_ts_dump_name(stdout)
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
else: # EXT_SUCCESS
new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
return new_dump

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
Expand Down
12 changes: 8 additions & 4 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ handle_exit()
ECODE=$?
echo "Removing lock. Exit: $ECODE" >&2
$RM $V -rf ${LOCKDIR}
# Echo the filename as the last statement if the generation succeeds
if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then
echo $TARFILE
fi
}

handle_signal()
Expand Down Expand Up @@ -1360,8 +1364,6 @@ main() {

# Invoke the TechSupport Cleanup Hook
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

echo ${TARFILE}

if ! $SAVE_STDERR
then
Expand Down Expand Up @@ -1494,7 +1496,6 @@ while getopts ":xnvhzas:t:r:d" opt; do
;;
v)
# echo commands about to be run to stderr
set -v
V="-v"
;;
n)
Expand Down Expand Up @@ -1547,14 +1548,17 @@ fi
## Attempt Locking
##

if mkdir "${LOCKDIR}" &>/dev/null; then
if $MKDIR "${LOCKDIR}" &>/dev/null; then
trap 'handle_exit' EXIT
echo "$$" > "${PIDFILE}"
# This handler will exit the script upon receiving these interrupts
# Trap configured on EXIT will be triggered by the exit from handle_signal function
trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
echo "Lock succesfully accquired and installed signal handlers"
# Proceed with the actual code
if [[ ! -z "${V}" ]]; then
set -v
fi
main
else
# lock failed, check if the other PID is alive
Expand Down
36 changes: 34 additions & 2 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import sys
import pyfakefs
import unittest
import signal
from pyfakefs.fake_filesystem_unittest import Patcher
from swsscommon import swsscommon
from utilities_common.general import load_module_from_source
from utilities_common.db import Db
from utilities_common.auto_techsupport_helper import EXT_RETRY
from .mock_tables import dbconnector

sys.path.append("scripts")
Expand All @@ -18,6 +20,9 @@
/tmp/saisdkdump
"""

def signal_handler(signum, frame):
raise Exception("Timed out!")

def set_auto_ts_cfg(redis_mock, state="disabled",
rate_limit_interval="0",
max_core_size="0.0",
Expand Down Expand Up @@ -264,7 +269,7 @@ def test_since_argument(self):
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "--since '4 days ago'" in cmd_str:
if "--since 4 days ago" in cmd_str:
patcher.fs.create_file(ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date=4 days ago" in cmd_str:
Expand Down Expand Up @@ -330,7 +335,7 @@ def test_invalid_since_argument(self):
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "--since '2 days ago'" in cmd_str:
if "--since 2 days ago" in cmd_str:
patcher.fs.create_file(ts_dump)
print(AUTO_TS_STDOUT + ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
Expand Down Expand Up @@ -396,3 +401,30 @@ def mock_cmd(cmd, env):
assert "orchagent.12345.123.core.gz" in current_fs
assert "lldpmgrd.12345.22.core.gz" in current_fs
assert "python3.12345.21.core.gz" in current_fs

def test_max_retry_ts_failure(self):
"""
Scenario: TS subprocess is continously returning EXT_RETRY
Make sure auto-ts is not exceeding the limit
"""
db_wrap = Db()
redis_mock = db_wrap.db
set_auto_ts_cfg(redis_mock, state="enabled")
set_feature_table_cfg(redis_mock, state="enabled")
with Patcher() as patcher:
def mock_cmd(cmd, env):
return EXT_RETRY, "", ""

cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)

signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(5) # 5 seconds
try:
cls.handle_core_dump_creation_event()
except Exception:
assert False, "Method should not time out"
finally:
signal.alarm(0)

10 changes: 8 additions & 2 deletions utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
"CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN",
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF",
"SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY",
"EXT_SUCCESS", "MAX_RETRY_LIMIT"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand Down Expand Up @@ -60,6 +61,11 @@
TIME_BUF = 20
SINCE_DEFAULT = "2 days ago"

# Techsupport Exit Codes
EXT_LOCKFAIL = 2
EXT_RETRY = 4
EXT_SUCCESS = 0
MAX_RETRY_LIMIT = 2

# Helper methods
def subprocess_exec(cmd, env=None):
Expand Down

0 comments on commit 7050581

Please sign in to comment.