Skip to content

Commit

Permalink
[Auto Techsupport] Event driven Techsupport Changes (sonic-net#1796)
Browse files Browse the repository at this point in the history
#### What I did

sonic-utilities changes required for the feature "Event Driven TechSupport Invocation & CoreDump Mgmt". [HLD](sonic-net/SONiC#818)

Summary of the changes:

- Added the AUTO GEN CLI for the CFG DB tables required for this feature
- Added the coredump_gen_handler.py & techsupport_cleanup.py scripts.
- Added the unit tests (UTs) required for these scripts.
- Enhanced coredump-compress & generate-dump scripts
  • Loading branch information
vivekrnv authored Nov 16, 2021
1 parent efa2ff6 commit a3e34e3
Show file tree
Hide file tree
Showing 10 changed files with 1,458 additions and 0 deletions.
350 changes: 350 additions & 0 deletions config/plugins/auto_techsupport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
"""
Autogenerated config CLI plugin.
"""

import click
import utilities_common.cli as clicommon
import utilities_common.general as general
from config import config_mgmt


# The sonic-cfggen executable has no ".py" extension, so it is loaded
# explicitly from its installed path rather than through a regular import.
sonic_cfggen = general.load_module_from_source(
    'sonic_cfggen',
    '/usr/local/bin/sonic-cfggen',
)


def exit_with_error(*args, **kwargs):
    """Print a styled message and abort the running CLI command.

    All positional and keyword arguments are forwarded unchanged to
    ``click.secho``.

    Raises:
        click.Abort: always, once the message has been printed.
    """
    click.secho(*args, **kwargs)
    raise click.Abort


def validate_config_or_raise(cfg):
    """Validate config db data using ConfigMgmt.

    The configuration is first converted with
    ``sonic_cfggen.FormatConverter.to_serialized`` and then handed to
    ``config_mgmt.ConfigMgmt().loadData``.

    Args:
        cfg: Configuration dictionary to validate. Callers in this file pass
            the result of ``db.get_config()`` with the pending change applied.

    Raises:
        Exception: if conversion or loading fails for any reason. The message
            is prefixed with ``Failed to validate configuration:`` and the
            original error is attached as ``__cause__``.
    """
    try:
        serialized = sonic_cfggen.FormatConverter.to_serialized(cfg)
        config_mgmt.ConfigMgmt().loadData(serialized)
    except Exception as err:
        # Chain explicitly so the underlying failure stays visible in
        # tracebacks instead of only appearing as formatted text.
        raise Exception('Failed to validate configuration: {}'.format(err)) from err


def add_entry_validated(db, table, key, data):
    """Create a new table entry once the resulting configuration validates.

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table to add the entry to.
        key: Key of the new entry.
        data: Attribute mapping stored for the new entry.

    Raises:
        Exception: if ``key`` is already present in ``table``, or if
            ``validate_config_or_raise`` rejects the updated configuration.
    """
    config = db.get_config()
    entries = config.setdefault(table, {})
    if key in entries:
        raise Exception(f"{key} already exists")

    # Validate the whole configuration with the new entry in place before
    # anything is written through db.set_entry.
    entries[key] = data
    validate_config_or_raise(config)
    db.set_entry(table, key, data)


def update_entry_validated(db, table, key, data, create_if_not_exists=False):
    """Update an entry in a table and validate the resulting configuration.

    Each attribute in ``data`` is written into the entry. An attribute whose
    value is ``None`` is removed from the entry; if it is not present, it is
    simply ignored (it is never stored as ``None``).

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table holding the entry.
        key: Key of the entry to update.
        data: Mapping of attribute name to new value (``None`` means delete).
        create_if_not_exists: When true, a missing entry is created empty
            before the update instead of raising.

    Raises:
        Exception: if ``key`` is not in ``table`` and ``create_if_not_exists``
            is false, or if ``validate_config_or_raise`` rejects the result.
    """
    config = db.get_config()
    entries = config.setdefault(table, {})

    if create_if_not_exists:
        entries.setdefault(key, {})

    if key not in entries:
        raise Exception(f"{key} does not exist")

    entry = entries[key]
    for attr, value in data.items():
        if value is None:
            # None always means "delete". Using a default in pop() keeps an
            # absent attribute absent rather than storing a literal None.
            entry.pop(attr, None)
        else:
            entry[attr] = value

    validate_config_or_raise(config)
    db.set_entry(table, key, entry)


def del_entry_validated(db, table, key):
    """Remove an entry from a table once the resulting configuration validates.

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table holding the entry.
        key: Key of the entry to remove.

    Raises:
        Exception: if ``key`` is not present in ``table``, or if
            ``validate_config_or_raise`` rejects the configuration without it.
    """
    config = db.get_config()
    entries = config.setdefault(table, {})
    if key not in entries:
        raise Exception(f"{key} does not exist")

    del entries[key]
    validate_config_or_raise(config)
    # The entry is written back as None.
    # NOTE(review): presumably this is how the db object deletes a key; confirm.
    db.set_entry(table, key, None)


def add_list_entry_validated(db, table, key, attr, data):
    """Append items to a list attribute of an entry and validate the result.

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table holding the entry.
        key: Key of the entry whose list attribute is extended.
        attr: Name of the list attribute (created empty when missing).
        data: Iterable of items to append.

    Raises:
        Exception: if ``key`` is not in ``table``, if any item is already in
            the list, or if ``validate_config_or_raise`` rejects the result.
    """
    config = db.get_config()
    entries = config.setdefault(table, {})
    if key not in entries:
        raise Exception(f"{key} does not exist")

    record = entries[key]
    values = record.setdefault(attr, [])
    for item in data:
        if item in values:
            raise Exception(f"{item} already exists")
        values.append(item)

    validate_config_or_raise(config)
    db.set_entry(table, key, record)


def del_list_entry_validated(db, table, key, attr, data):
    """Remove items from a list attribute of an entry and validate the result.

    When the list ends up empty, the attribute itself is dropped from the
    entry.

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table holding the entry.
        key: Key of the entry whose list attribute is reduced.
        attr: Name of the list attribute.
        data: Iterable of items to remove.

    Raises:
        Exception: if ``key`` is not in ``table``, if any item is not in the
            list, or if ``validate_config_or_raise`` rejects the result.
    """
    config = db.get_config()
    entries = config.setdefault(table, {})
    if key not in entries:
        raise Exception(f"{key} does not exist")

    record = entries[key]
    values = record.setdefault(attr, [])
    for item in data:
        if item not in values:
            raise Exception(f"{item} does not exist")
        values.remove(item)

    # An emptied list is not kept around as an empty attribute.
    if not values:
        del record[attr]

    validate_config_or_raise(config)
    db.set_entry(table, key, record)


def clear_list_entry_validated(db, table, key, attr):
    """Clear a list attribute of an entry and validate the configuration.

    Delegates to ``update_entry_validated`` with the attribute mapped to
    ``None``.

    Args:
        db: Object exposing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the table holding the entry.
        key: Key of the entry to modify.
        attr: Name of the list attribute to clear.
    """
    cleared = {attr: None}
    update_entry_validated(db, table, key, cleared)


# Top-level "auto-techsupport" click group; sub-groups and commands are
# attached to it below. The docstring is the help text shown by click.
@click.group(name="auto-techsupport", cls=clicommon.AliasedGroup)
def AUTO_TECHSUPPORT():
    """ AUTO_TECHSUPPORT part of config_db.json """


# "global" sub-group of "auto-techsupport". It carries no logic of its own;
# the injected db object is accepted but not used here.
@AUTO_TECHSUPPORT.group(name="global", cls=clicommon.AliasedGroup)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL(db):
    """ """


# Writes the "state" field of AUTO_TECHSUPPORT|GLOBAL, creating the entry
# when it does not exist yet. Any failure is reported in red and aborts.
@AUTO_TECHSUPPORT_GLOBAL.command(name="state")
@click.argument("state", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_state(db, state):
    """ Knob to make techsupport invocation event-driven based on core-dump generation """

    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"state": state},
            create_if_not_exists=True,
        )
    except Exception as exc:
        exit_with_error(f"Error: {exc}", fg="red")


# Writes the "rate_limit_interval" field of AUTO_TECHSUPPORT|GLOBAL, creating
# the entry when it does not exist yet. Any failure is reported in red and
# aborts the command.
@AUTO_TECHSUPPORT_GLOBAL.command(name="rate-limit-interval")
@click.argument("rate-limit-interval", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_rate_limit_interval(db, rate_limit_interval):
    """ Minimum time in seconds between two successive techsupport invocations. Configure 0 to explicitly disable """

    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"rate_limit_interval": rate_limit_interval},
            create_if_not_exists=True,
        )
    except Exception as exc:
        exit_with_error(f"Error: {exc}", fg="red")


# Writes the "max_techsupport_limit" field of AUTO_TECHSUPPORT|GLOBAL,
# creating the entry when it does not exist yet. The docstring below is the
# help text shown by click, so its spelling is user-facing.
@AUTO_TECHSUPPORT_GLOBAL.command(name="max-techsupport-limit")
@click.argument(
    "max-techsupport-limit",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_max_techsupport_limit(db, max_techsupport_limit):
    """ Max Limit in percentage for the cumulative size of ts dumps.
    No cleanup is performed if the value isn't configured or is 0.0
    """

    table = "AUTO_TECHSUPPORT"
    key = "GLOBAL"
    data = {
        "max_techsupport_limit": max_techsupport_limit,
    }
    try:
        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
    except Exception as err:
        # Report the failure in red and abort the command.
        exit_with_error(f"Error: {err}", fg="red")


# Writes the "max_core_limit" field of AUTO_TECHSUPPORT|GLOBAL, creating the
# entry when it does not exist yet. The docstring below is the help text
# shown by click, so its spelling is user-facing.
@AUTO_TECHSUPPORT_GLOBAL.command(name="max-core-limit")
@click.argument(
    "max-core-limit",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
    """ Max Limit in percentage for the cumulative size of core dumps.
    No cleanup is performed if the value isn't configured or is 0.0
    """

    table = "AUTO_TECHSUPPORT"
    key = "GLOBAL"
    data = {
        "max_core_limit": max_core_limit,
    }
    try:
        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
    except Exception as err:
        # Report the failure in red and abort the command.
        exit_with_error(f"Error: {err}", fg="red")


# Writes the "since" field of AUTO_TECHSUPPORT|GLOBAL, creating the entry
# when it does not exist yet. Any failure is reported in red and aborts.
@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
@click.argument("since", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_since(db, since):
    """ Only collect the logs & core-dumps generated since the time provided.
    A default value of '2 days ago' is used if this value is not set explicitly or a non-valid string is provided """

    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"since": since},
            create_if_not_exists=True,
        )
    except Exception as exc:
        exit_with_error(f"Error: {exc}", fg="red")


# Top-level "auto-techsupport-feature" click group; the add/update/delete
# commands are attached to it below. The docstring is click's help text.
@click.group(name="auto-techsupport-feature", cls=clicommon.AliasedGroup)
def AUTO_TECHSUPPORT_FEATURE():
    """ AUTO_TECHSUPPORT_FEATURE part of config_db.json """


# Creates a new AUTO_TECHSUPPORT_FEATURE entry keyed by the feature name.
# Only options that were actually supplied are stored on the entry.
@AUTO_TECHSUPPORT_FEATURE.command(name="add")
@click.argument("feature-name", nargs=1, required=True)
@click.option(
    "--state",
    help="Enable auto techsupport invocation on the processes running inside this feature",
)
@click.option(
    "--rate-limit-interval",
    help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
    """ Add object in AUTO_TECHSUPPORT_FEATURE. """

    supplied = {
        "state": state,
        "rate_limit_interval": rate_limit_interval,
    }
    # Options left unset arrive as None and are omitted from the entry.
    data = {field: value for field, value in supplied.items() if value is not None}

    try:
        add_entry_validated(db.cfgdb, "AUTO_TECHSUPPORT_FEATURE", feature_name, data)
    except Exception as exc:
        exit_with_error(f"Error: {exc}", fg="red")


# Updates an existing AUTO_TECHSUPPORT_FEATURE entry keyed by the feature
# name. Only options that were actually supplied are changed. The docstring
# below is the help text shown by click for this command.
@AUTO_TECHSUPPORT_FEATURE.command(name="update")
@click.argument(
    "feature-name",
    nargs=1,
    required=True,
)
@click.option(
    "--state",
    help="Enable auto techsupport invocation on the processes running inside this feature",
)
@click.option(
    "--rate-limit-interval",
    help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
    """ Update object in AUTO_TECHSUPPORT_FEATURE. """

    table = "AUTO_TECHSUPPORT_FEATURE"
    key = feature_name
    data = {}
    # Options left unset arrive as None and must not touch the stored entry.
    if state is not None:
        data["state"] = state
    if rate_limit_interval is not None:
        data["rate_limit_interval"] = rate_limit_interval

    try:
        # No create_if_not_exists here: updating an unknown feature is an error.
        update_entry_validated(db.cfgdb, table, key, data)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")


# Removes the AUTO_TECHSUPPORT_FEATURE entry keyed by the feature name.
# Any failure is reported in red and aborts the command.
@AUTO_TECHSUPPORT_FEATURE.command(name="delete")
@click.argument("feature-name", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_delete(db, feature_name):
    """ Delete object in AUTO_TECHSUPPORT_FEATURE. """

    try:
        del_entry_validated(db.cfgdb, "AUTO_TECHSUPPORT_FEATURE", feature_name)
    except Exception as exc:
        exit_with_error(f"Error: {exc}", fg="red")


def register(cli):
    """Attach this plugin's command groups to the given CLI.

    The groups are processed in order: ``AUTO_TECHSUPPORT`` first, then
    ``AUTO_TECHSUPPORT_FEATURE``. Each is checked and added before the next
    one is looked at.

    Args:
        cli: Object exposing a ``commands`` container and ``add_command()``.

    Raises:
        Exception: if a group's name is already present in ``cli.commands``.
    """
    for node in (AUTO_TECHSUPPORT, AUTO_TECHSUPPORT_FEATURE):
        if node.name in cli.commands:
            raise Exception(f"{node.name} already exists in CLI")
        cli.add_command(node)
17 changes: 17 additions & 0 deletions scripts/coredump-compress
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,28 @@ while [[ $# > 1 ]]; do
shift
done

CONTAINER_ID=""
# Only look up the crashing process when an argument is still left; it is
# used below as the PID under /proc.
# Inside [ ], ">" is an output redirection (it would create a file named "0"
# and make the test always true), so the numeric operator -gt is used.
if [ $# -gt 0 ]; then
    # Docker container id taken from the process's pids cgroup path, if any.
    CONTAINER_ID=$(xargs -0 -L1 -a /proc/${1}/cgroup | grep -oP "pids:/docker/\K\w+")
    # NAMESPACE_ID from the process environment is appended to the file prefix.
    ns=`xargs -0 -L1 -a /proc/${1}/environ | grep -e "^NAMESPACE_ID" | cut -f2 -d'='`
    if [ -n "${ns}" ]; then
        PREFIX=${PREFIX}${ns}.
    fi
fi

# Compress the core read from stdin into /var/core.
/bin/gzip -1 - > /var/core/${PREFIX}core.gz

if [[ ! -z $CONTAINER_ID ]]; then
    # Resolve the container name; cut drops the leading "/" docker prints.
    CONTAINER_NAME=$(docker inspect --format='{{.Name}}' ${CONTAINER_ID} | cut -c2-)
    if [[ ! -z ${CONTAINER_NAME} ]]; then
        # coredump_gen_handler invokes techsupport if all the other required conditions are met
        # explicitly passing in the env vars because coredump-compress's namespace doesn't have these set by default
        for path in $(find /usr/local/lib/python3*/dist-packages -maxdepth 0); do
            PYTHONPATH=$PYTHONPATH:$path
        done
        # NOTE(review): the handler runs inside the command substitution and
        # setsid itself receives no command; kept as-is, confirm this is intended.
        setsid $(echo > /tmp/coredump_gen_handler.log;
                 export PYTHONPATH=$PYTHONPATH;
                 python3 /usr/local/bin/coredump_gen_handler.py ${PREFIX}core.gz ${CONTAINER_NAME} &>> /tmp/coredump_gen_handler.log) &
    fi
fi

Loading

0 comments on commit a3e34e3

Please sign in to comment.