Skip to content

Commit

Permalink
Detect and log control plane drops (#20481)
Browse files Browse the repository at this point in the history
Detect and log control plane drops

Signed-off-by: Prabhat Aravind <paravind@microsoft.com>
  • Loading branch information
prabhataravind authored Nov 7, 2024
1 parent 9685498 commit 2177af1
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 0 deletions.
2 changes: 2 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,8 @@ sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
sudo cp $IMAGE_CONFIGS/monit/arp_update_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/arp_update_checker
sudo cp $IMAGE_CONFIGS/monit/control_plane_drop_check $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/control_plane_drop_check

# Installed smartmontools version should match installed smartmontools in docker-platform-monitor Dockerfile
# TODO: are mismatching versions fine for bookworm?
Expand Down
5 changes: 5 additions & 0 deletions files/image_config/monit/conf.d/sonic-host
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,8 @@ check program memory_check with path "/usr/local/bin/memory_threshold_check.py"
# arp_update_checker tool that verifies that arp_update script is not stuck on ping command every 10 minutes
check program arp_update_checker with path "/usr/bin/arp_update_checker" every 10 cycles
if status != 0 for 3 times within 3 cycles then alert repeat every 1 cycles

# Check if there are control plane packet drops reported by softnet_stats
check program controlPlaneDropCheck with path "/usr/bin/control_plane_drop_check"
every 5 cycles
if status != 0 for 3 cycle then alert repeat every 1 cycles
92 changes: 92 additions & 0 deletions files/image_config/monit/control_plane_drop_check
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
This script's job is to verify that there are no control plane
packet drops reported by /proc/net/softnet_stat.
This is to be run periodically on a SONiC device using a monit
configuration file.
"""
import os
import sys
import syslog


def write_syslog(message, *args):
    """
    Emit a NOTICE-level entry to syslog.
    Args:
        message (str): Text to log; used as a %-style format string
            when any extra positional arguments are supplied
        args: Optional values interpolated into message
    Returns:
        None
    """
    # Only interpolate when values were given, so a literal '%' in a
    # plain message is logged untouched.
    text = message % args if args else message
    syslog.syslog(syslog.LOG_NOTICE, text)


def get_softnet_dropped_count(softnet_stats_file="/proc/net/softnet_stat"):
    """
    Get dropped count from softnet stats procfs.

    The file is expected to hold one row of whitespace-separated
    hexadecimal counters per CPU; the second column of every row is summed.
    Args:
        softnet_stats_file (str): Path of the softnet statistics file to
            parse. Defaults to the kernel's procfs entry.
    Returns:
        drop_count (int): Number of dropped packets
    Raises:
        OSError: If the statistics file cannot be opened or read
        ValueError: If a second-column field is not valid hexadecimal
    """
    drop_count = 0

    with open(softnet_stats_file, 'r') as f:
        for line in f:
            # split() yields an empty list for blank lines, so the length
            # check below also skips them.
            stat = line.split()
            # Drop count is in the second column for each CPU. Ref:
            # https://github.com/torvalds/linux/blob/v5.10/net/core/net-procfs.c#L153
            if len(stat) > 1:
                drop_count += int(stat[1], 16)

    return drop_count


def check_packet_drops(drop_count_stash='/tmp/softnet_dropped_count.txt'):
    """
    The function that checks for kernel packet drops.

    The cumulative drop count returned by get_softnet_dropped_count() is
    compared against the count stashed by the previous invocation, and the
    stash is then refreshed with the current count.
    Args:
        drop_count_stash (str): Path to the file that stores the last drop
            count between runs.
    Returns:
        True if there are packet drops, False otherwise
    Raises:
        OSError: If the softnet statistics cannot be read or the stash
            file cannot be read or written
    """
    current_drop_count = get_softnet_dropped_count()

    # Open the stash directly instead of testing for existence first, so
    # there is no window between the check and the read.
    try:
        with open(drop_count_stash, 'r') as f:
            count = f.read().strip()
        # An empty stash is treated as a previous count of zero.
        last_drop_count = int(count) if count else 0
    except FileNotFoundError:
        # Initial run: there is no baseline to compare against yet.
        last_drop_count = None
    except ValueError:
        # Non-numeric (or undecodable) stash contents. Treat this like an
        # initial run so the stash is rewritten below; otherwise every
        # later run would fail on the same unreadable file.
        write_syslog("control_plane_drop_check: ignoring invalid drop count stash {}".format(drop_count_stash))
        last_drop_count = None

    # Always refresh the stash so the next run compares against this one.
    with open(drop_count_stash, 'w') as f:
        f.write(str(current_drop_count))

    if last_drop_count is None:
        return False

    if current_drop_count > last_drop_count:
        write_syslog("control_plane_drop_check: packet drops detected, current drop count: {}".format(current_drop_count))
        return True

    return False


if __name__ == "__main__":
    # Exit status contract used by the monit check: 0 means no new drops,
    # 1 means drops were detected or the check itself failed.
    try:
        drops_detected = check_packet_drops()
    except Exception as err:
        write_syslog("control_plane_drop_check exception: {}".format(str(err)))
        # A failed check is reported the same way as detected drops.
        drops_detected = True

    exit_code = 0
    if drops_detected:
        exit_code = 1
    sys.exit(exit_code)

0 comments on commit 2177af1

Please sign in to comment.