1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
|
"""
Systemd test
"""
import contextlib
import logging
import re
import time
from teuthology.orchestra import run
from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
log = logging.getLogger(__name__)
def _remote_service_status(remote, service):
    """Return the ``systemctl status`` output for *service* on *remote*.

    ``check_status=False`` because ``systemctl status`` exits non-zero for
    stopped/failed units, and we want the text either way.
    """
    return remote.sh('sudo systemctl status %s' % service,
                     check_status=False)
@contextlib.contextmanager
def task(ctx, config):
    """
    - tasks:
      ceph-deploy:
      systemd:

    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors

    For each remote: stop/start everything via ceph.target, then stop/start
    each individual daemon unit (osd/mon/mgr/mds) present on that node, and
    finally reboot all nodes and wait for the cluster to come back healthy.
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        # str.find() returns -1 (truthy) when the substring is absent, so
        # `if units.find('failed')` fired on nearly every output; use
        # substring membership instead.
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        # 'Active: ...' lines only appear in `systemctl status` output,
        # never in `ps` output, so inspect the captured status text.
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        if 'Active: active' in status:
            log.info("Successfully started all Ceph services")
        else:
            # was log.info("info", "..."): the stray positional arg with no
            # %s placeholder corrupted the log record
            log.info("Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        log.info(ps_eaf)
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        # raw string: bare '\d' in a normal literal is a DeprecationWarning
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status',
                             osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop',
                             osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, osd_service)
            log.info(status)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            # use remote.run(args=[...]) like every other invocation here
            remote.run(args=['sudo', 'systemctl', 'start', osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mon_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mgr_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mds_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd unit doesnt start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield
|