1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
#!/bin/sh
# PCP QA Test No. 1201
# pmlogger_check for primary logger and pmcd may not be running (yet)
# - systemd and package install failure case
#
# This is the multiple pmlogger version. See qa/1200 for the single
# pmlogger version.
#
# Copyright (c) 2020 Ken McDonell. All Rights Reserved.
#
# The test's sequence number is the base name of this script.
seq=$(basename $0)
echo "QA output created by $seq"

# Source the shared QA files: standard environment, output filters and
# pre-flight checks.
. ./common.product
. ./common.filter
. ./common.check
# Cleanup handler for this test.
#
# Changes back to $here.  If (and only if) _needclean is "true" it
# removes the pmlogger control file installed for $remote, exports
# PMLOGGER_CHECK_SKIP_JANITOR=no, restarts pmcd and pmlogger via the
# _service wrapper, waits for each of them and resets _needclean to
# false.  Finally $tmp and every $tmp.* file are removed.
#
# Globals read:    here, sudo, remote, tmp, PCP_PMLOGGERCONTROL_PATH
# Globals written: _needclean, PMLOGGER_CHECK_SKIP_JANITOR (exported)
#
_cleanup()
{
    cd "$here"

    # An unset _needclean would expand to an empty command, which the
    # shell treats as success, and the services would be restarted
    # even though the test never touched them ... default to false.
    if ${_needclean:-false}
    then
        if [ -f "${PCP_PMLOGGERCONTROL_PATH}.d/$remote" ]
        then
            $sudo rm -f "${PCP_PMLOGGERCONTROL_PATH}.d/$remote"
        fi
        export PMLOGGER_CHECK_SKIP_JANITOR=no
        _service pmcd restart 2>&1 | _filter_pcp_restart
        _wait_for_pmcd
        _service pmlogger restart 2>&1 | _filter_pcp_restart
        _wait_for_pmlogger
        _needclean=false
    fi

    # With an empty $tmp the pattern "$tmp.*" would become ".*" and
    # match dot files in the current directory, so skip the removal.
    if [ -n "${tmp:-}" ]
    then
        $sudo rm -rf "$tmp" "$tmp".*
    fi
}
# Wait for the primary pmlogger to start answering pmlc.
# Borrowed from _wait_for_pmlogger.
#
# Usage: _my_wait_for_pmlogger [maxdelay]
#   maxdelay   number of one-second attempts before giving up
#              (optional, default 6)
#
# Each attempt runs "pmlc -P" as $PCP_USER (via $sudo) and saves its
# output in $tmp.err.  Output containing "Connection refused" or
# "Transport endpoint is not connected" means "not ready yet".
#
# Nothing is printed on success.  On timeout a report is written to
# stdout: the current date, the pmlogger log (if it exists) and the
# output of the last pmlc attempt.
#
# Side effect: sets the globals _dir_hostname and _logfile.
#
_my_wait_for_pmlogger()
{
    # 6 seconds default seems like a reasonable max time to get going
    _maxdelay=${1:-6}
    _dir_hostname=$(hostname || echo localhost)
    _logfile="$PCP_ARCHIVE_DIR/$_dir_hostname/pmlogger.log"
    _i=0
    _dead=true
    while [ "$_i" -lt "$_maxdelay" ]
    do
        if $sudo -u "$PCP_USER" pmlc -P </dev/null 2>&1 \
            | tee "$tmp.err" \
            | grep -E "Connection refused|Transport endpoint is not connected" >/dev/null
        then
            sleep 1
            _i=$((_i + 1))
        else
            # pmlogger socket has been set up ...
            _dead=false
            # give pmlogger a chance to detect that pmlc has gone away
            # so the port is free
            sleep 1
            break
        fi
    done

    if $_dead
    then
        echo "now: $(date)"
        echo "Oops ... primary pmlogger failed to start after $_maxdelay seconds"
        echo "pmlogger log ($_logfile) ..."
        if [ -f "$_logfile" ]
        then
            cat "$_logfile"
        else
            echo "Not created ... this is good as it means pmlogger_check noticed"
        fi
        echo "pmlc attempt ..."
        if [ -f "$tmp.err" ]
        then
            cat "$tmp.err"
        fi
    fi
}
# Output filter for the wait report (stdin to stdout).
#
# The raw input is first appended to $seq_full.  Then, on each line,
# whatever follows "now:" becomes DATE, the first occurrence of
# $PCP_ARCHIVE_DIR becomes the literal PCP_ARCHIVE_DIR and the first
# occurrence of $_dir_hostname becomes HOSTNAME.  The result is passed
# through _filter_pmlogger_log.
#
_filter()
{
    tee -a $seq_full |
    sed -e '/^now: /s/ .*/ DATE/' \
        -e "s@$PCP_ARCHIVE_DIR@PCP_ARCHIVE_DIR@" \
        -e "s@$_dir_hostname@HOSTNAME@" |
    _filter_pmlogger_log
}
# failure is the default!
status=1
# run _cleanup and leave with $status on exit and on signals 1, 2, 3, 15
trap '_cleanup; exit $status' 0 1 2 3 15

# this test needs one remote host with a running pmcd
remote=$(./getpmcdhosts -L -n 1)
if [ -z "$remote" ]
then
    _notrun "Cannot find remote host running pmcd"
fi
echo "remote=\"$remote\"" >>$seq_full
_needclean=true
myhost=$(hostname)

# pmlogger configuration referenced from the control file below
cat >$tmp.config <<End-of-File
log mandatory on once { pmcd }
log advisory on default { kernel.all.cpu }
End-of-File

# control file entry for the remote host, installed under
# ${PCP_PMLOGGERCONTROL_PATH}.d and named after that host
cat >$tmp.control <<End-of-File
# Installed by PCP QA test $seq on `date`
\$version=1.1
$remote n n PCP_ARCHIVE_DIR/$remote -c $tmp.config
End-of-File
$sudo cp $tmp.control ${PCP_PMLOGGERCONTROL_PATH}.d/$remote
# real QA test starts here
echo "[`date`] initially" >>$seq_full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq_full

# stop pmcd, and all pmloggers
#
# The output of each stop is captured in a file and filtered afterwards.
# The exit status has to be tested in this shell: every command of a
# pipeline runs in a subshell, so an _exit issued on the left-hand side
# of "... | _filter_pcp_stop" would only end that subshell and the test
# would carry on after a failed stop.
#
echo "[`date`] pcp stop" >>$seq_full
_service pmlogger stop >"$tmp.stop" 2>&1
_sts=$?
_filter_pcp_stop <"$tmp.stop"
[ "$_sts" -eq 0 ] || _exit 1
_wait_pmlogger_end || _exit 1

_service pmcd stop >"$tmp.stop" 2>&1
_sts=$?
_filter_pcp_stop <"$tmp.stop"
[ "$_sts" -eq 0 ] || _exit 1
_wait_pmcd_end || _exit 1

$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq_full
# No "_service" wrapper from this point on: the rc scripts are run
# directly so that nothing links the services together or starts
# anything behind our back.
#
_dir_hostname=$(hostname || echo localhost)
_logfile="$PCP_ARCHIVE_DIR/$_dir_hostname/pmlogger.log"

# Round 1: pmlogger is started while pmcd is still down.
$sudo rm -f $_logfile
echo "[$(date)]" >>$seq_full
echo "pmcd not running, expect this to timeout" | tee -a $seq_full
$sudo $PCP_RC_DIR/pmlogger start 2>&1 | _filter_pcp_start
_my_wait_for_pmlogger | _filter
echo "[$(date)]" >>$seq_full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq_full

# Bring pmcd up; give up if it does not arrive.
$sudo $PCP_RC_DIR/pmcd start 2>&1 | _filter_pcp_start
if ! _wait_for_pmcd
then
    _exit 1
fi

# The pmlogger "rc" script runs pmlogger_check, which in turn uses
# pmlogger_janitor to kill off any "lost" pmloggers.  The janitor must
# not kill the primary pmlogger that is about to be started, so ask
# pmlogger_check to skip it.
#
PMLOGGER_CHECK_SKIP_JANITOR=yes
export PMLOGGER_CHECK_SKIP_JANITOR

# Round 2: pmcd is up, so pmlogger is expected to start.
$sudo rm -f $_logfile
echo "[$(date)]" >>$seq_full
echo "pmcd running, expect this to work" | tee -a $seq_full
$sudo $PCP_RC_DIR/pmlogger start 2>&1 | _filter_pcp_start
if ! _wait_for_pmlogger
then
    _exit 1
fi
echo "[$(date)]" >>$seq_full
$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]PID|/[p](mcd|mlogger)' >>$seq_full
# Fetch pmcd.pmlogger.pmcd_host once: a copy goes to $seq_full and the
# checks below read $tmp.tmp.
pminfo -f pmcd.pmlogger.pmcd_host | tee -a $seq_full >$tmp.tmp

# the primary pmlogger must be listed
if ! grep "\"primary\"" $tmp.tmp >/dev/null
then
    echo "Error: primary pmlogger missing"
    cat $tmp.tmp
    pcp
else
    echo "Found primary pmlogger"
fi

# ... and so must the pmlogger for the remote host
_have_remote=true
if ! grep "\"$remote\"" $tmp.tmp >/dev/null
then
    # getpmcdhosts may have returned a FQDN, while hostname() on the
    # remote host may return an abbreviated name, and that is the name
    # that shows up in pmcd.pmlogger.pmcd_host ... retry with
    # everything from the first "." onwards stripped
    #
    r=$(echo $remote | sed -e 's/\..*//')
    grep "\"$r\"" $tmp.tmp >/dev/null || _have_remote=false
fi
if $_have_remote
then
    echo "Found non-primary pmlogger"
else
    echo "Error: non-primary pmlogger missing"
    cat $tmp.tmp
    pcp
fi

# success, all done
status=0
exit
|