Skip to content

Commit

Permalink
[service] introducing serial port watchdog service (#1743)
Browse files Browse the repository at this point in the history
* [rc.local] refactor platform identification code to separate function

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [rc.local] infrastructure to take action according to installer.conf

* [serial port watchdog] add service to watch serial port processes

Monitor serial port processes. Kill ones stuck for too long.

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [rc.local] start watchdog on serial port specified by installer.conf

Signed-off-by: Ying Xie <ying.xie@microsoft.com>
  • Loading branch information
yxieca authored May 25, 2018
1 parent d165a50 commit bb6ff62
Show file tree
Hide file tree
Showing 4 changed files with 385 additions and 4 deletions.
5 changes: 5 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/

# Copy serial-port-watchdog configuration scripts
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/

# Copy updategraph script and service file
sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service
Expand Down
44 changes: 40 additions & 4 deletions files/image_config/platform/rc.local
Original file line number Diff line number Diff line change
Expand Up @@ -183,15 +183,51 @@ for x in "$@"; do
done
}

eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")

if [ -f /host/image-$sonic_version/platform/firsttime ]; then

setup_platform()
{
if [ -n "$aboot_platform" ]; then
platform=$aboot_platform
elif [ -n "$onie_platform" ]; then
platform=$onie_platform
else
platform=''
fi
}

# Setup default values in this function before reading installer.conf
# installer.conf could override the value set in this function.
setup_platform_defaults()
{
# Default serial port: ttyS0
CONSOLE_DEV=0
}

load_platform_installer_config()
{
INSTALLER_CFG=/usr/share/sonic/device/$platform/installer.conf
if [ -f $INSTALLER_CFG ]; then
. $INSTALLER_CFG
fi
}

program_serial_port()
{
sed -i "s|ttyS.|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service
systemctl daemon-reload
systemctl restart serial-port-watchdog.service
}

eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")

setup_platform
setup_platform_defaults
load_platform_installer_config

program_serial_port

if [ -f /host/image-$sonic_version/platform/firsttime ]; then

if [ -z "$platform" ]; then
echo "Unknown sonic platform"
firsttime_exit
fi
Expand Down
328 changes: 328 additions & 0 deletions files/image_config/serial-port-watchdog/serial-port-watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/usr/bin/env python

from __future__ import print_function, with_statement

import argparse
import logging
import logging.handlers
import os
import time
import signal
import socket
import sys

from collections import namedtuple

PRGNAME = 'serial-port-watchdog'

DEVFS_PATH = '/dev'
PROCFS_PATH = '/proc'

# According to procfs(5)
ProcStat = namedtuple( 'ProcStat', [
'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid',
'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime',
'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue',
'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode',
'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore',
'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor',
'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time',
'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start',
'arg_end', 'env_start', 'env_end', 'exit_code'
] )

# According to procfs(5)
ProcIo = namedtuple( 'ProcIo', [
'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes',
'cancelled_write_bytes'
] )

class Process( object ):
def __init__( self, pid, path=PROCFS_PATH ):
self.pid = pid
self.path = os.path.join( path, str( pid ) )
self.childs = []
self.parent = None

self.stat = None

self.io = None
self.stack = None
self.stackStartTime = None

def refresh( self ):
with open( os.path.join( self.path, 'stat' ) ) as f:
data = f.read().rstrip().split()
self.stat = ProcStat( *data )

def getStat( self, key=None ):
self.refresh()
return self.stat

def uid( self ):
return '%s/%s' % ( self.pid, self.stat.starttime )

def ppid( self ):
return self.stat.ppid

def name( self ):
with open( os.path.join( self.path, 'comm' ) ) as f:
return f.read().rstrip()

def getTtyForFd( self, fd ):
path = os.path.join( self.path, 'fd', str( fd ) )
if not os.path.exists( path ):
return ''
return os.readlink( path )

def getStack( self ):
with open( os.path.join( self.path, 'stack' ) ) as f:
return f.read()

def getIo( self ):
with open( os.path.join( self.path, 'io' ) ) as f:
data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ]
return ProcIo( *data )

def isUsingTty( self, tty ):
return self.getTtyForFd( 0 ).endswith( tty )

def checkStuck( self, content ):
stack = self.getStack()

found = False
for match in content:
if match in stack:
found = True
break

if not found:
self.io = None
self.stack = None
self.stackStartTime = None
return 0

io = self.getIo()

if self.stack != stack or self.io != io:
self.io = io
self.stack = stack
self.stackStartTime = time.time()
return 0

return time.time() - self.stackStartTime

def __repr__( self ):
return '<Process uid=%s>' % self.uid()

class ProcessMonitor( object ):
def __init__( self, path=PROCFS_PATH ):
self.path = path
self.procs = {}
self.filters = []
self.checkers = []
self.whitelist = []

def addProcessFilter( self, func, *args ):
self.filters.append( ( func, args ) )

def addStuckChecker( self, func, *args ):
self.checkers.append( ( func, args ) )

def setWhitelist( self, whitelist ):
self.whitelist = whitelist

def shouldHandleProcess( self, proc ):
matched = False
for func, args in self.filters:
if func( proc, *args ):
matched = True
break

if not matched:
return False

name = proc.name()
for item in self.whitelist:
if item in name:
return False

return True

def getRunningPids( self ):
pids = []
for entry in os.listdir( self.path ):
if not entry.isdigit():
continue
pids.append( int( entry ) )
return pids

def killStuckProcess( self, proc, elapsed, kill, timeout ):
if not elapsed:
return

if elapsed < timeout:
if elapsed > timeout / 2:
logging.info( 'process %d seems stuck, idle for %ds, waiting '
'some more time', proc.pid, elapsed )
return

logging.warning( 'process %d has been stuck for %d seconds, killing...',
proc.pid, elapsed )
logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack )
if kill:
# XXX: SIGTERM sleep then if alive SIGKILL ?
os.kill( proc.pid, signal.SIGKILL )

def killStuckProcesses( self, kill, timeout ):
for proc in self.procs.values():
for checker, args in self.checkers:
elapsed = checker( proc, *args )
self.killStuckProcess( proc, elapsed, kill, timeout )

def updatePid( self, pid ):
p = Process( pid )

# if the process is already monitored (previously running)
r = self.procs.get( pid, None )
if r:
p.refresh()
# if the process is still running
if p.uid() == r.uid():
logging.debug( 'process %d still running', pid )
return
# or the pid was reused but the process is different
logging.debug( 'pid %d reused for another process', pid )
del self.procs[ pid ]

# check if the process is relevant for monitoring
if not self.shouldHandleProcess( p ):
return

logging.debug( 'watching process %d', pid )
p.refresh()
self.procs[ pid ] = p

def updateParenting( self ):
# clear parent and childs for monitored processes
for proc in self.procs.values():
del proc.childs[:]
proc.parent = None

# set parent and childs for monitored processes
for proc in self.procs.values():
ppid = proc.ppid()
parent = self.procs.get( ppid, None )
if parent:
proc.parent = parent
parent.childs.append( proc )

def update( self ):
pids = self.getRunningPids()

# remove defunct processes
for pid in list(self.procs.keys()):
if pid not in pids:
logging.debug( 'process %d is defunct', pid )
del self.procs[ pid ]

# create or update running processes information
for pid in pids:
try:
self.updatePid( pid )
except:
logging.warning( 'An issue occured whileupdating process %s',
pid )
raise

#self.updateParenting()

def checkRootPermissions():
if os.geteuid() != 0:
logging.error( 'You must be root to use this feature' )
sys.exit( 1 )

def getHostname():
try:
return socket.gethostname()
except:
return 'localhost'

def setupLogging( verbose=False ):
loglevel = logging.DEBUG if verbose else logging.INFO
dateFmt = '%Y-%m-%d %H:%M:%S'

log = logging.getLogger()
log.setLevel( logging.DEBUG )

logOut = logging.StreamHandler( sys.stdout )
logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) )
logOut.setLevel( loglevel )
log.addHandler( logOut )

logSys = logging.handlers.SysLogHandler()
# format to rfc5424 format
fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME )
logSys.setFormatter( logging.Formatter( fmt ) )
logSys.setLevel( logging.WARNING )
log.addHandler( logSys )
try:
# the connection to the syslog socket happens with the first message
log.info( 'Attaching to syslog' )
except:
log.warning( 'Failed open syslog' )

def listParser( value ):
if not value.strip():
return []
return value.split( ',' )

def ttyParser( dev, path=DEVFS_PATH ):
if not dev.startswith( DEVFS_PATH ):
dev = os.path.join( DEVFS_PATH, dev )
if not os.path.exists( dev ):
raise argparse.ArgumentTypeError( '%s is not a device' % dev )
return dev

def parseArgs( args ):
parser = argparse.ArgumentParser()

parser.add_argument( '-d', '--dry-run', action='store_true',
help='only print processes that would be killed' )
parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser,
help='functions to look for in the stack trace' )
parser.add_argument( '-i', '--interval', default=60, type=float,
help='interval at which to check the procfs' )
parser.add_argument( '-k', '--timeout', default=3600, type=float,
help='timeout for which a process gets killed' )
parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser,
help='tty to check for stuck process' )
parser.add_argument( '-v', '--verbose', action='store_true',
help='print all debug messages' )
parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser,
help='whitelist programs that should never be killed' )

return parser.parse_args( args )

def main( args ):
args = parseArgs( args )

setupLogging( args.verbose )
checkRootPermissions()

m = ProcessMonitor()
m.addProcessFilter( Process.isUsingTty, args.tty )
m.addStuckChecker( Process.checkStuck, args.funcs )
m.setWhitelist( args.whitelist )

while True:
logging.debug( 'updating processes' )
m.update()
m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout )
time.sleep( args.interval )

return 0

if __name__ == '__main__':
sys.exit( main( sys.argv[ 1: ] ) )

Loading

0 comments on commit bb6ff62

Please sign in to comment.