#!/usr/bin/python
# Copyright (C) 2008, 2014 Red Hat Inc.
#
# This file is part of systemtap, and is free software. You can
# redistribute it and/or modify it under the terms of the GNU General
# Public License (GPL); either version 2, or (at your option) any
# later version.
# This script monitors a remote system that is running the kprobes
# test. If several consecutive 'ping's fail, the system is rebooted.
#
# This script takes as an argument a config filename, whose contents
# should look like the following:
#
# config_opts['system_name'] = "SYSTEM_NAME"
# config_opts['restart_cmds'] = [
# 'CMD1',
# 'CMD2',
# ]
#
# As an example, here is a config file used when monitoring a kvm
# instance:
#
# config_opts['system_name'] = "dhcp-148"
# config_opts['restart_cmds'] = [
# 'sudo virsh destroy kvm-rawhide-64-1',
# 'sudo virsh start kvm-rawhide-64-1',
# ]
#
# Here's a config file used when monitoring a beaker system
# (https://beaker-project.org/):
#
# config_opts['system_name'] = "ibm-z10-38"
# config_opts['restart_cmds'] = [
# 'bkr system-power --action reboot ibm-z10-38'
# ]
import sys
import os
import time
# Names of the options every config file must define.
REQUIRED_OPTS = ('system_name', 'restart_cmds')

# Number of consecutive failed pings after which the system is
# considered crashed and gets restarted.
MAX_PING_FAILURES = 3

# Delays, in seconds.
PING_OK_DELAY = 60          # wait after a successful ping
PING_RETRY_DELAY = 30       # wait after a failed ping, before retrying
BOOT_DELAY = 5 * 60         # wait after a restart so the system can boot


def load_config(path):
    """Execute the config file at 'path' and return its config_opts dict.

    The config file is plain Python that fills in (or rebinds) a
    pre-defined 'config_opts' dictionary. It is run in its own
    namespace so that it cannot clobber this script's variables.

    NOTE: the file is executed as code, so it must come from a
    trusted source.
    """
    namespace = {'config_opts': dict()}
    with open(path) as cfg_file:
        source = cfg_file.read()
    # exec(compile(...), ns) is valid in both python 2 and python 3,
    # unlike execfile(), which python 3 removed.
    exec(compile(source, path, 'exec'), namespace)
    return namespace['config_opts']


def missing_options(config_opts):
    """Return the list of required option names absent from config_opts."""
    return [opt for opt in REQUIRED_OPTS if opt not in config_opts]


def host_is_up(system_name):
    """Send a single ping to 'system_name'; return True if it answered."""
    status = os.system("ping -c 1 %s" % system_name)
    # Compare the raw status against 0 instead of using
    # os.WEXITSTATUS(): the latter is only meaningful when the child
    # exited normally, and yields 0 (a false "success") when the child
    # was killed by a signal.
    return status == 0


def restart_system(config_opts):
    """Run each configured restart command, in order.

    A failing command is reported but does not stop the remaining
    commands from running (e.g. 'virsh destroy' may fail on a guest
    that is already down, yet 'virsh start' must still run).
    """
    sys.stderr.write("Restarting %s...\n" % config_opts['system_name'])
    for cmd in config_opts['restart_cmds']:
        sys.stderr.write("Running '%s'...\n" % cmd)
        status = os.system(cmd)
        if status != 0:
            sys.stderr.write("Warning: '%s' failed (status %d)\n"
                             % (cmd, status))


def monitor(config_opts):
    """Ping the system forever, restarting it when it stops answering.

    This function never returns.
    """
    errors = 0
    while True:
        # If ping worked, system is still up and running. Wait a
        # minute and try again.
        if host_is_up(config_opts['system_name']):
            errors = 0
            time.sleep(PING_OK_DELAY)
            continue

        # The ping failed, so increase the error count. Until we've
        # seen enough consecutive errors, just retry a bit later.
        errors += 1
        if errors < MAX_PING_FAILURES:
            time.sleep(PING_RETRY_DELAY)
            continue

        # Too many consecutive errors: assume the machine has crashed
        # and restart it.
        restart_system(config_opts)

        # Give the system a chance to boot before pinging it again.
        sys.stderr.write("Sleeping for %d minutes...\n" % (BOOT_DELAY // 60))
        time.sleep(BOOT_DELAY)
        errors = 0


def main(argv):
    """Validate the command line and config file, then start monitoring.

    'argv' is the full argument vector (program name first). Returns
    1 on a usage or configuration error; otherwise it does not return.
    """
    if len(argv) != 2:
        sys.stderr.write("Usage: %s config_file\n" % argv[0])
        return 1
    cfg = argv[1]

    # Read in the config file
    if not os.path.exists(cfg):
        sys.stderr.write("Could not find required config file: %s\n" % cfg)
        return 1
    sys.stdout.write("Reading config file %s...\n" % cfg)
    # Flush so the message is not reordered after ping's own output
    # when stdout is redirected to a file or pipe.
    sys.stdout.flush()
    config_opts = load_config(cfg)

    missing = missing_options(config_opts)
    for opt in missing:
        sys.stderr.write("Missing required config opt '%s'\n" % opt)
    if missing:
        return 1

    monitor(config_opts)


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|