Skip to content

Commit

Permalink
[warm-reboot] Add new preboot health check: verify database integrity (
Browse files Browse the repository at this point in the history
…sonic-net#1785)

What I did
Verify database integrity before proceeding with warm reboot or fast reboot.
This integrity check uses a JSON schema to validate the DBs. To start with, only the presence of counters_db's COUNTERS_PORT_NAME_MAP table is verified, but this list can be extended in the future.
The test logic is designed to be generic: more databases, or more tables within them, can simply be added to the schema list, and the verification logic needs no change.
How I did it
Added a JSON schema, and generic schema validation logic.
  • Loading branch information
vaibhavhd authored Sep 13, 2021
1 parent 41e31e8 commit c007d65
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 3 deletions.
84 changes: 84 additions & 0 deletions scripts/check_db_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

"""
Verify that the database has its critical tables present before warmboot can proceed.
If warmboot is allowed with critical tables missing, it can lead to issues on the
going-down path or during the recovery path. This test detects such issues before proceeding.
The verification procedure uses JSON schemas to verify the DB entities.
In the future, to verify new tables or their content, only a schema modification is needed;
no modification to the integrity check logic should be required.
"""

import os, sys
import json, jsonschema
import syslog
import subprocess
import traceback

# Maps a database name to the JSON schema that a dump of that database is
# validated against. To check another database or table, extend this mapping.
DB_SCHEMA = {
    "COUNTERS_DB": {
        "$schema": "http://json-schema.org/draft-06/schema",
        "type": "object",
        "title": "Schema for COUNTERS DB's entities",
        # Tables that must be present in the dump.
        "required": ["COUNTERS_PORT_NAME_MAP"],
        "properties": {
            "COUNTERS_PORT_NAME_MAP": {
                "$id": "#/properties/COUNTERS_PORT_NAME_MAP",
                "type": "object",
            },
        },
    },
}


def main():
    """Validate a dump of every database listed in DB_SCHEMA against its schema.

    For each ``db_name -> schema`` entry the database is dumped with
    ``sonic-db-dump`` into ``/tmp/<db_name>.json``, the dump is parsed as JSON
    and the result is checked with ``jsonschema.validate``.

    Returns:
        0 if DB_SCHEMA is empty or every listed database passes validation;
        1 if a dump cannot be produced, a dump is not valid JSON, or a
        database fails schema validation.
    """
    if not DB_SCHEMA:
        return 0

    for db_name, schema in DB_SCHEMA.items():
        db_dump_file = "/tmp/{}.json".format(db_name)
        # Dump the database this schema entry is for (not a fixed one), and
        # pass the arguments as a list so no shell parsing is involved.
        dump_db_cmd = ["sonic-db-dump", "-n", db_name, "-y"]
        try:
            with open(db_dump_file, "w") as fp:
                proc = subprocess.run(dump_db_cmd, stdout=fp,
                                      stderr=subprocess.PIPE, text=True)
            rc, err = proc.returncode, proc.stderr
        except OSError as exc:
            # The dump tool could not be started or the dump file could not
            # be created; report it the same way as a failed dump.
            rc, err = 1, str(exc)

        if rc != 0:
            msg = "Failed to dump db {}. Return code: {} with err: {}".format(db_name, rc, err)
            # Flush explicitly: the script terminates via os._exit(), which
            # does not flush buffered stdout.
            print(msg, flush=True)
            syslog.syslog(syslog.LOG_ERR, msg)
            # Without a dump there is nothing meaningful to validate.
            return 1

        try:
            with open(db_dump_file) as fp:
                db_dump_data = json.load(fp)
        except ValueError as exc:
            # This aborts the check, so log it as an error rather than debug.
            syslog.syslog(syslog.LOG_ERR, "DB json file is not a valid json file. " +
                          "Error: {}".format(str(exc)))
            return 1

        # What: Validate if critical tables and entries are present in DB.
        # Why: This is needed to avoid warmbooting with a bad DB; which can
        #      potentially trigger failures in the reboot recovery path.
        # How: Validate DB against a schema which defines required tables.
        try:
            jsonschema.validate(instance=db_dump_data, schema=schema)
        except jsonschema.exceptions.ValidationError as exc:
            syslog.syslog(syslog.LOG_ERR, "Database is missing tables/entries needed for reboot procedure. " +
                          "DB integrity check failed with:\n{}".format(str(exc.message)))
            return 1

    syslog.syslog(syslog.LOG_DEBUG, "Database integrity checks passed.")
    return 0


if __name__ == '__main__':
    # Script entry point: run main() and turn its outcome into the process
    # exit status (main()'s return value, 1 on SIGINT, 2 on any other error).
    res = 0
    try:
        res = main()
    except KeyboardInterrupt:
        syslog.syslog(syslog.LOG_NOTICE, "SIGINT received. Quitting")
        res = 1
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, "Got an exception %s: Traceback: %s" % (str(e), traceback.format_exc()))
        res = 2
    finally:
        syslog.closelog()

    # The process always ends through os._exit() below, which does not flush
    # stdio buffers. Flush explicitly so anything printed by main() is not
    # lost when stdout/stderr are redirected. Best effort: a closed or broken
    # stream must not change the exit status.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.flush()
        except (AttributeError, OSError, ValueError):
            pass

    # sys.exit() raises SystemExit, which is caught here so that the process
    # is terminated with os._exit() using the same status.
    try:
        sys.exit(res)
    except SystemExit:
        os._exit(res)
31 changes: 28 additions & 3 deletions scripts/fast-reboot
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
VERBOSE=no
FORCE=no
IGNORE_ASIC=no
IGNORE_DB_CHECK=no
STRICT=no
REBOOT_METHOD="/sbin/kexec -e"
ASSISTANT_IP_LIST=""
Expand All @@ -38,6 +39,7 @@ EXIT_SYNCD_SHUTDOWN=11
EXIT_FAST_REBOOT_DUMP_FAILURE=12
EXIT_FILTER_FDB_ENTRIES_FAILURE=13
EXIT_COUNTERPOLL_DELAY_FAILURE=14
EXIT_DB_INTEGRITY_FAILURE=15
EXIT_NO_CONTROL_PLANE_ASSISTANT=20
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21

Expand All @@ -59,8 +61,9 @@ function showHelpAndExit()
echo "Usage: ${REBOOT_SCRIPT_NAME} [options]"
echo " -h,-? : get this help"
echo " -v : turn on verbose"
echo " -f : force execution"
echo " -i : ignore MD5-checksum-verification of ASIC configuration files"
echo " -f : force execution - ignore Orchagent RESTARTCHECK failure"
echo " -i : force execution - ignore ASIC MD5-checksum-verification"
echo " -d : force execution - ignore database integrity check"
echo " -r : reboot with /sbin/reboot"
echo " -k : reboot with /sbin/kexec -e [default]"
echo " -x : execute script with -x flag"
Expand All @@ -74,7 +77,7 @@ function showHelpAndExit()

function parseOptions()
{
while getopts "vfih?rkxc:s" opt; do
while getopts "vfidh?rkxc:s" opt; do
case ${opt} in
h|\? )
showHelpAndExit
Expand All @@ -88,6 +91,9 @@ function parseOptions()
i )
IGNORE_ASIC=yes
;;
d )
IGNORE_DB_CHECK=yes
;;
r )
REBOOT_METHOD="/sbin/reboot"
;;
Expand Down Expand Up @@ -327,6 +333,23 @@ function check_docker_exec()
done
}
# Run the database integrity check for warm-reboot and fastfast-reboot.
# A failing check aborts the reboot with EXIT_DB_INTEGRITY_FAILURE unless
# IGNORE_DB_CHECK is "yes" (the '-d' option), in which case it is only logged.
function check_db_integrity()
{
    # Other reboot types do not run this check.
    if [[ "$REBOOT_TYPE" != "warm-reboot" && "$REBOOT_TYPE" != "fastfast-reboot" ]]; then
        return 0
    fi

    # Capture the checker's exit status without letting a failure end the script here.
    CHECK_DB_INTEGRITY=0
    /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$?
    if [[ ${CHECK_DB_INTEGRITY} -eq 0 ]]; then
        return 0
    fi

    if [[ x"${IGNORE_DB_CHECK}" == x"yes" ]]; then
        debug "Ignoring Database integrity checks..."
    else
        error "Failed to validate DB's integrity. Exit code: ${CHECK_DB_INTEGRITY}. Use '-d' option to force ignore this check."
        exit ${EXIT_DB_INTEGRITY_FAILURE}
    fi
}
function reboot_pre_check()
{
check_docker_exec
Expand All @@ -337,6 +360,8 @@ function reboot_pre_check()
fi
rm ${filename}
check_db_integrity
# Make sure /host has enough space for warm reboot temp files
avail=$(df -k /host | tail -1 | awk '{ print $4 }')
if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then
Expand Down

0 comments on commit c007d65

Please sign in to comment.