-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[service] introducing serial port watchdog service (#1743)
* [rc.local] refactor platform identification code to separate function Signed-off-by: Ying Xie <ying.xie@microsoft.com> * [rc.local] infrastructure to take action according to installer.conf * [serial port watchdog] add service to watch serial port processes Monitor serial port processes. Kill ones stuck for too long. Signed-off-by: Ying Xie <ying.xie@microsoft.com> * [rc.local] start watchdog on serial port specified by installer.conf Signed-off-by: Ying Xie <ying.xie@microsoft.com>
- Loading branch information
Showing
4 changed files
with
385 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
328 changes: 328 additions & 0 deletions
328
files/image_config/serial-port-watchdog/serial-port-watchdog.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,328 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function, with_statement | ||
|
||
import argparse | ||
import logging | ||
import logging.handlers | ||
import os | ||
import time | ||
import signal | ||
import socket | ||
import sys | ||
|
||
from collections import namedtuple | ||
|
||
PRGNAME = 'serial-port-watchdog' | ||
|
||
DEVFS_PATH = '/dev' | ||
PROCFS_PATH = '/proc' | ||
|
||
# According to procfs(5) | ||
ProcStat = namedtuple( 'ProcStat', [ | ||
'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid', | ||
'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime', | ||
'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue', | ||
'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode', | ||
'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore', | ||
'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor', | ||
'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time', | ||
'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start', | ||
'arg_end', 'env_start', 'env_end', 'exit_code' | ||
] ) | ||
|
||
# According to procfs(5) | ||
ProcIo = namedtuple( 'ProcIo', [ | ||
'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes', | ||
'cancelled_write_bytes' | ||
] ) | ||
|
||
class Process( object ): | ||
def __init__( self, pid, path=PROCFS_PATH ): | ||
self.pid = pid | ||
self.path = os.path.join( path, str( pid ) ) | ||
self.childs = [] | ||
self.parent = None | ||
|
||
self.stat = None | ||
|
||
self.io = None | ||
self.stack = None | ||
self.stackStartTime = None | ||
|
||
def refresh( self ): | ||
with open( os.path.join( self.path, 'stat' ) ) as f: | ||
data = f.read().rstrip().split() | ||
self.stat = ProcStat( *data ) | ||
|
||
def getStat( self, key=None ): | ||
self.refresh() | ||
return self.stat | ||
|
||
def uid( self ): | ||
return '%s/%s' % ( self.pid, self.stat.starttime ) | ||
|
||
def ppid( self ): | ||
return self.stat.ppid | ||
|
||
def name( self ): | ||
with open( os.path.join( self.path, 'comm' ) ) as f: | ||
return f.read().rstrip() | ||
|
||
def getTtyForFd( self, fd ): | ||
path = os.path.join( self.path, 'fd', str( fd ) ) | ||
if not os.path.exists( path ): | ||
return '' | ||
return os.readlink( path ) | ||
|
||
def getStack( self ): | ||
with open( os.path.join( self.path, 'stack' ) ) as f: | ||
return f.read() | ||
|
||
def getIo( self ): | ||
with open( os.path.join( self.path, 'io' ) ) as f: | ||
data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ] | ||
return ProcIo( *data ) | ||
|
||
def isUsingTty( self, tty ): | ||
return self.getTtyForFd( 0 ).endswith( tty ) | ||
|
||
def checkStuck( self, content ): | ||
stack = self.getStack() | ||
|
||
found = False | ||
for match in content: | ||
if match in stack: | ||
found = True | ||
break | ||
|
||
if not found: | ||
self.io = None | ||
self.stack = None | ||
self.stackStartTime = None | ||
return 0 | ||
|
||
io = self.getIo() | ||
|
||
if self.stack != stack or self.io != io: | ||
self.io = io | ||
self.stack = stack | ||
self.stackStartTime = time.time() | ||
return 0 | ||
|
||
return time.time() - self.stackStartTime | ||
|
||
def __repr__( self ): | ||
return '<Process uid=%s>' % self.uid() | ||
|
||
class ProcessMonitor( object ): | ||
def __init__( self, path=PROCFS_PATH ): | ||
self.path = path | ||
self.procs = {} | ||
self.filters = [] | ||
self.checkers = [] | ||
self.whitelist = [] | ||
|
||
def addProcessFilter( self, func, *args ): | ||
self.filters.append( ( func, args ) ) | ||
|
||
def addStuckChecker( self, func, *args ): | ||
self.checkers.append( ( func, args ) ) | ||
|
||
def setWhitelist( self, whitelist ): | ||
self.whitelist = whitelist | ||
|
||
def shouldHandleProcess( self, proc ): | ||
matched = False | ||
for func, args in self.filters: | ||
if func( proc, *args ): | ||
matched = True | ||
break | ||
|
||
if not matched: | ||
return False | ||
|
||
name = proc.name() | ||
for item in self.whitelist: | ||
if item in name: | ||
return False | ||
|
||
return True | ||
|
||
def getRunningPids( self ): | ||
pids = [] | ||
for entry in os.listdir( self.path ): | ||
if not entry.isdigit(): | ||
continue | ||
pids.append( int( entry ) ) | ||
return pids | ||
|
||
def killStuckProcess( self, proc, elapsed, kill, timeout ): | ||
if not elapsed: | ||
return | ||
|
||
if elapsed < timeout: | ||
if elapsed > timeout / 2: | ||
logging.info( 'process %d seems stuck, idle for %ds, waiting ' | ||
'some more time', proc.pid, elapsed ) | ||
return | ||
|
||
logging.warning( 'process %d has been stuck for %d seconds, killing...', | ||
proc.pid, elapsed ) | ||
logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack ) | ||
if kill: | ||
# XXX: SIGTERM sleep then if alive SIGKILL ? | ||
os.kill( proc.pid, signal.SIGKILL ) | ||
|
||
def killStuckProcesses( self, kill, timeout ): | ||
for proc in self.procs.values(): | ||
for checker, args in self.checkers: | ||
elapsed = checker( proc, *args ) | ||
self.killStuckProcess( proc, elapsed, kill, timeout ) | ||
|
||
def updatePid( self, pid ): | ||
p = Process( pid ) | ||
|
||
# if the process is already monitored (previously running) | ||
r = self.procs.get( pid, None ) | ||
if r: | ||
p.refresh() | ||
# if the process is still running | ||
if p.uid() == r.uid(): | ||
logging.debug( 'process %d still running', pid ) | ||
return | ||
# or the pid was reused but the process is different | ||
logging.debug( 'pid %d reused for another process', pid ) | ||
del self.procs[ pid ] | ||
|
||
# check if the process is relevant for monitoring | ||
if not self.shouldHandleProcess( p ): | ||
return | ||
|
||
logging.debug( 'watching process %d', pid ) | ||
p.refresh() | ||
self.procs[ pid ] = p | ||
|
||
def updateParenting( self ): | ||
# clear parent and childs for monitored processes | ||
for proc in self.procs.values(): | ||
del proc.childs[:] | ||
proc.parent = None | ||
|
||
# set parent and childs for monitored processes | ||
for proc in self.procs.values(): | ||
ppid = proc.ppid() | ||
parent = self.procs.get( ppid, None ) | ||
if parent: | ||
proc.parent = parent | ||
parent.childs.append( proc ) | ||
|
||
def update( self ): | ||
pids = self.getRunningPids() | ||
|
||
# remove defunct processes | ||
for pid in list(self.procs.keys()): | ||
if pid not in pids: | ||
logging.debug( 'process %d is defunct', pid ) | ||
del self.procs[ pid ] | ||
|
||
# create or update running processes information | ||
for pid in pids: | ||
try: | ||
self.updatePid( pid ) | ||
except: | ||
logging.warning( 'An issue occured whileupdating process %s', | ||
pid ) | ||
raise | ||
|
||
#self.updateParenting() | ||
|
||
def checkRootPermissions(): | ||
if os.geteuid() != 0: | ||
logging.error( 'You must be root to use this feature' ) | ||
sys.exit( 1 ) | ||
|
||
def getHostname(): | ||
try: | ||
return socket.gethostname() | ||
except: | ||
return 'localhost' | ||
|
||
def setupLogging( verbose=False ): | ||
loglevel = logging.DEBUG if verbose else logging.INFO | ||
dateFmt = '%Y-%m-%d %H:%M:%S' | ||
|
||
log = logging.getLogger() | ||
log.setLevel( logging.DEBUG ) | ||
|
||
logOut = logging.StreamHandler( sys.stdout ) | ||
logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) ) | ||
logOut.setLevel( loglevel ) | ||
log.addHandler( logOut ) | ||
|
||
logSys = logging.handlers.SysLogHandler() | ||
# format to rfc5424 format | ||
fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME ) | ||
logSys.setFormatter( logging.Formatter( fmt ) ) | ||
logSys.setLevel( logging.WARNING ) | ||
log.addHandler( logSys ) | ||
try: | ||
# the connection to the syslog socket happens with the first message | ||
log.info( 'Attaching to syslog' ) | ||
except: | ||
log.warning( 'Failed open syslog' ) | ||
|
||
def listParser( value ): | ||
if not value.strip(): | ||
return [] | ||
return value.split( ',' ) | ||
|
||
def ttyParser( dev, path=DEVFS_PATH ): | ||
if not dev.startswith( DEVFS_PATH ): | ||
dev = os.path.join( DEVFS_PATH, dev ) | ||
if not os.path.exists( dev ): | ||
raise argparse.ArgumentTypeError( '%s is not a device' % dev ) | ||
return dev | ||
|
||
def parseArgs( args ): | ||
parser = argparse.ArgumentParser() | ||
|
||
parser.add_argument( '-d', '--dry-run', action='store_true', | ||
help='only print processes that would be killed' ) | ||
parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser, | ||
help='functions to look for in the stack trace' ) | ||
parser.add_argument( '-i', '--interval', default=60, type=float, | ||
help='interval at which to check the procfs' ) | ||
parser.add_argument( '-k', '--timeout', default=3600, type=float, | ||
help='timeout for which a process gets killed' ) | ||
parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser, | ||
help='tty to check for stuck process' ) | ||
parser.add_argument( '-v', '--verbose', action='store_true', | ||
help='print all debug messages' ) | ||
parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser, | ||
help='whitelist programs that should never be killed' ) | ||
|
||
return parser.parse_args( args ) | ||
|
||
def main( args ): | ||
args = parseArgs( args ) | ||
|
||
setupLogging( args.verbose ) | ||
checkRootPermissions() | ||
|
||
m = ProcessMonitor() | ||
m.addProcessFilter( Process.isUsingTty, args.tty ) | ||
m.addStuckChecker( Process.checkStuck, args.funcs ) | ||
m.setWhitelist( args.whitelist ) | ||
|
||
while True: | ||
logging.debug( 'updating processes' ) | ||
m.update() | ||
m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout ) | ||
time.sleep( args.interval ) | ||
|
||
return 0 | ||
|
||
if __name__ == '__main__': | ||
sys.exit( main( sys.argv[ 1: ] ) ) | ||
|
Oops, something went wrong.