Skip to content

Commit

Permalink
[techsupport] Handle minor fixes of TS Lock and update auto-TS (sonic…
Browse files Browse the repository at this point in the history
…-net#2114)

1. Print the last statement as the techsupport dump name, as some automation processes might depend of parsing the last line to infer the dump path.

Previously:
  handle_exit
  Removing lock. Exit: 0
  removed '/tmp/techsupport-lock/PID'
  removed directory '/tmp/techsupport-lock'

Updated:
  handle_exit
  Removing lock. Exit: 0
  removed '/tmp/techsupport-lock/PID'
  removed directory '/tmp/techsupport-lock'
  /var/dump/sonic_dump_r-bulldog-03_20220324_195553.tar.gz

2. Don't acquire the lock when running in NOOP mode
3. Set the set -v option just before running main so that it won't print the generate_dump code to stdout
4. Update the auto-techsupport script to handle EXT_RETRY and EXT_LOCKFAIL exit codes returned by show techsupport command.
5. Update the minor error in since argument for auto-techsupport

Signed-off-by: Vivek Keddy Karri <vkarri@nvidia.com>
  • Loading branch information
vivekrnv authored Apr 3, 2022
1 parent 51d3550 commit f70dc27
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 16 deletions.
24 changes: 16 additions & 8 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,26 @@ def parse_ts_dump_name(self, ts_stdout):
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
return ""

def invoke_ts_cmd(self, since_cfg):
since_cfg = "'" + since_cfg + "'"
def invoke_ts_cmd(self, since_cfg, num_retry=0):
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
cmd = " ".join(cmd_opts)
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
if rc:
new_dump = ""
if rc == EXT_LOCKFAIL:
syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
elif rc == EXT_RETRY:
if num_retry <= MAX_RETRY_LIMIT:
return self.invoke_ts_cmd(since_cfg, num_retry+1)
else:
syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
elif rc != EXT_SUCCESS:
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
new_dump = self.parse_ts_dump_name(stdout)
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
else: # EXT_SUCCESS
new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
return new_dump

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
Expand Down
12 changes: 8 additions & 4 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ handle_exit()
ECODE=$?
echo "Removing lock. Exit: $ECODE" >&2
$RM $V -rf ${LOCKDIR}
# Echo the filename as the last statement if the generation succeeds
if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then
echo $TARFILE
fi
}

handle_signal()
Expand Down Expand Up @@ -1360,8 +1364,6 @@ main() {

# Invoke the TechSupport Cleanup Hook
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

echo ${TARFILE}

if ! $SAVE_STDERR
then
Expand Down Expand Up @@ -1494,7 +1496,6 @@ while getopts ":xnvhzas:t:r:d" opt; do
;;
v)
# echo commands about to be run to stderr
set -v
V="-v"
;;
n)
Expand Down Expand Up @@ -1547,14 +1548,17 @@ fi
## Attempt Locking
##

if mkdir "${LOCKDIR}" &>/dev/null; then
if $MKDIR "${LOCKDIR}" &>/dev/null; then
trap 'handle_exit' EXIT
echo "$$" > "${PIDFILE}"
# This handler will exit the script upon receiving these interrupts
# Trap configured on EXIT will be triggered by the exit from handle_signal function
trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
echo "Lock succesfully accquired and installed signal handlers"
# Proceed with the actual code
if [[ ! -z "${V}" ]]; then
set -v
fi
main
else
# lock failed, check if the other PID is alive
Expand Down
36 changes: 34 additions & 2 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import sys
import pyfakefs
import unittest
import signal
from pyfakefs.fake_filesystem_unittest import Patcher
from swsscommon import swsscommon
from utilities_common.general import load_module_from_source
from utilities_common.db import Db
from utilities_common.auto_techsupport_helper import EXT_RETRY
from .mock_tables import dbconnector

sys.path.append("scripts")
Expand All @@ -18,6 +20,9 @@
/tmp/saisdkdump
"""

def signal_handler(signum, frame):
raise Exception("Timed out!")

def set_auto_ts_cfg(redis_mock, state="disabled",
rate_limit_interval="0",
max_core_size="0.0",
Expand Down Expand Up @@ -264,7 +269,7 @@ def test_since_argument(self):
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "--since '4 days ago'" in cmd_str:
if "--since 4 days ago" in cmd_str:
patcher.fs.create_file(ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date=4 days ago" in cmd_str:
Expand Down Expand Up @@ -330,7 +335,7 @@ def test_invalid_since_argument(self):
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "--since '2 days ago'" in cmd_str:
if "--since 2 days ago" in cmd_str:
patcher.fs.create_file(ts_dump)
print(AUTO_TS_STDOUT + ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
Expand Down Expand Up @@ -396,3 +401,30 @@ def mock_cmd(cmd, env):
assert "orchagent.12345.123.core.gz" in current_fs
assert "lldpmgrd.12345.22.core.gz" in current_fs
assert "python3.12345.21.core.gz" in current_fs

def test_max_retry_ts_failure(self):
"""
Scenario: TS subprocess is continously returning EXT_RETRY
Make sure auto-ts is not exceeding the limit
"""
db_wrap = Db()
redis_mock = db_wrap.db
set_auto_ts_cfg(redis_mock, state="enabled")
set_feature_table_cfg(redis_mock, state="enabled")
with Patcher() as patcher:
def mock_cmd(cmd, env):
return EXT_RETRY, "", ""

cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)

signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(5) # 5 seconds
try:
cls.handle_core_dump_creation_event()
except Exception:
assert False, "Method should not time out"
finally:
signal.alarm(0)

10 changes: 8 additions & 2 deletions utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
"CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN",
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF",
"SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY",
"EXT_SUCCESS", "MAX_RETRY_LIMIT"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand Down Expand Up @@ -60,6 +61,11 @@
TIME_BUF = 20
SINCE_DEFAULT = "2 days ago"

# Techsupport Exit Codes
EXT_LOCKFAIL = 2
EXT_RETRY = 4
EXT_SUCCESS = 0
MAX_RETRY_LIMIT = 2

# Helper methods
def subprocess_exec(cmd, env=None):
Expand Down

0 comments on commit f70dc27

Please sign in to comment.