1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
|
#!/bin/sh
# PCP QA Test No. 1483
# look for bad syslog entries
#
# Copyright (c) 2024 Ken McDonell. All Rights Reserved.
#
# check-group-exclude: pmlogger_check pmlogger_daily
#
# Establish $seq and print the banner line that opens the test output.
#
# No arguments: $seq is always set from this script's own file name and
# the banner shows $seq.
# With arguments: a non-empty $seq already present (from the caller) is
# kept, otherwise it falls back to the file name; the banner shows the
# file name followed by the arguments.
#
# "$0" is quoted so that a path containing whitespace or glob characters
# reaches basename as one operand instead of being split or expanded.
#
if [ $# -eq 0 ]
then
    seq=$(basename "$0")
    echo "QA output created by $seq"
else
    # use $seq from caller, unless not set
    [ -n "$seq" ] || seq=$(basename "$0")
    echo "QA output created by $(basename "$0") $*"
fi
# get standard environment, filters and checks
. ./common.product
. ./common.filter
. ./common.check

# Everything below reads the systemd journal, so hand over to _notrun
# (not defined in this file; presumably supplied by the common scripts
# sourced above) when journalctl cannot be found or when $PCPQA_SYSTEMD
# is "no".
#
# "command -v" is the POSIX shell way of locating a command, so the
# probe does not itself depend on which(1) being installed.
#
command -v journalctl >/dev/null 2>&1 || _notrun "no journalctl executable installed"
[ "$PCPQA_SYSTEMD" = no ] && _notrun "we're not using systemd here, so journalctl not useful"
# Tidy up on the way out (run from the trap installed just below this
# function): return to the directory named by $here, then remove $tmp
# and everything named $tmp.<something>.
#
# $here, $tmp and $sudo are not assigned in this file; presumably they
# are provided by the common scripts sourced earlier.
#
_cleanup()
{
    cd "$here"

    # If $tmp were empty the glob below would collapse to ".*" and the
    # rm (possibly run via $sudo) would delete every dot-file in $here,
    # so do nothing unless $tmp really names something.
    #
    # $sudo is left unquoted on purpose: an empty value must vanish
    # rather than become an empty command name.
    #
    if [ -n "$tmp" ]
    then
        $sudo rm -rf "$tmp" "$tmp".*
    fi
}
# The exit status the test will report; success unless something later
# in the script changes it.
status=0

# On shell exit (0) and on signals 1, 2, 3 and 15: run _cleanup, then
# exit with whatever $status holds at that moment.  The single quotes
# keep $status from being expanded while the trap is being installed.
trap '_cleanup; exit $status' 0 1 2 3 15
# Culling of expected lines from journalctl output.
#
# _filter is the entry point; the _cull_* functions are its private
# helpers, one per list of patterns.  Every sed command used here is a
# plain delete (/re/d), so the order of patterns inside a list does not
# affect the result.

# Lines that are OK for any service.
#
# The pmpost ones are here because there is some undiagnosed issue when
# QA is running that dinks with /var/log/pcp/NOTICES ... triage has
# failed and the file always ends up with the correct permissions after
# QA is done.
#
_cull_any_service()
{
    sed \
        -e '/^-- Boot .* --$/d' \
        -e '/^-- Reboot --$/d' \
        -e '/^-- No entries --$/d' \
        -e '/^-- Journal begins at /d' \
        -e '/^-- Logs begin at /d' \
        -e '/: End[: ]/d' \
        -e '/error while loading shared libraries: libpcp/d' \
        -e '/rc\[[0-9]*]: .*pmpost:.* cannot open .*NOTICES/d' \
        -e '/rc\[[0-9]*]: .*pmpost: unposted message:/d'
}

# Extra lines that are OK in the pmcd journal.
#
_cull_pmcd()
{
    sed \
        -e '/ Installing .* PMDA /d' \
        -e '/ Removing .* PMDA /d' \
        -e '/ Rebuilding PMNS /d' \
        -e '/pmcd\[[0-9]*]: .* pmdaopenmetrics([0-9]*) Info:/d' \
        -e '/pmcd\[[0-9]*]: .* pmdaopentelemetry([0-9]*) Info:/d' \
        -e '/pmcd\[[0-9]*]: \.*$/d' \
        -e '/pmcd\[[0-9]*]: .*\.\.done$/d' \
        -e '/pmcd\[[0-9]*]: Terminated/d' \
        -e '/root\[[0-9]*]: pmcd_wait failed in /d' \
        -e '/pmcd\[[0-9]*]: _pmda_setup: Interrupted!/d' \
        -e '/pmcd\[[0-9]*]: _pmda_setup_cleanup: reset \.NeedInstall/d' \
        -e '/pmcd\[[0-9]*]: .*\/pmcd: .* cannot start pmcd/d' \
        -e '/systemctl\[[0-9]*]: .* pmcd\.service changed on disk/d'
}

# Extra lines that are OK in the journals of the pmie family.
#
# Regular pmie rule firing ...
#   ... pcp-pmie[3330341]: Severe ...
# these lines from qa/115
#   ... rc[24566]: /etc/init.d/rc:
#   ... rc[24566]: Error: PCP inference engine control file $PCP_PMIECONTROL_PATH ("/etc/pcp/pmie/control")
#   ... rc[24566]: is missing! Cannot start any Performance Co-Pilot inference engine(s).
#   ... rc[96813]: /etc/pcp/pmie/rc: Warning: Performance Co-Pilot Inference Engine (pmie) not permanently enabled.
#   ... rc[96813]: To enable pmie, run the following as root:
#   ... rc[96813]: # /bin/systemctl enable pmie.service
# qa/575 seems capable of tripping this one
#   ... pmie_farm[3016551]: End:
# pmie_check will fail during PCP builds, and there is this strange one
# but only on vm03
#   ... vm03.localdomain pmiectl[1013253]: mount: write error
# and the rest is just random QA noise.
#
_cull_pmie()
{
    # $PCP_SERVICES_DIR with every "/" turned into "\/", so the path
    # can sit inside a /.../ sed address
    _svcdir_re=`echo "$PCP_SERVICES_DIR" | sed -e 's@/@\\\\/@g'`

    sed \
        -e '/ pcp-pmie\[/d' \
        -e '/rc\[[0-9]*]: .*\/rc:$/d' \
        -e '/pmie\[[0-9]*]: '"$_svcdir_re"'\/pmie:$/d' \
        -e '/rc\[[0-9]*]: Error: .* \$PCP_PMIECONTROL_PATH/d' \
        -e '/pmie\[[0-9]*]: Error: .* \$PCP_PMIECONTROL_PATH/d' \
        -e '/rc\[[0-9]*]: .*is missing!/d' \
        -e '/pmie\[[0-9]*]: .*is missing!/d' \
        -e '/rc\[[0-9]*]: .*not permanently enabled/d' \
        -e '/pmie\[[0-9]*]: .*not permanently enabled/d' \
        -e '/rc\[[0-9]*]: .*run the following as root:/d' \
        -e '/pmie\[[0-9]*]: .*run the following as root:/d' \
        -e '/rc\[[0-9]*]: .*systemctl enable pmie\.service/d' \
        -e '/pmie\[[0-9]*]: .*systemctl enable pmie\.service/d' \
        -e '/rc\[[0-9]*]: Terminated$/d' \
        -e '/pmie_farm\[[0-9]*]: End:/d' \
        -e '/pmie_check failed - see .*\/pmie_check.log/d' \
        -e '/pmie_daily failed - see .*\/pmie_daily.log/d' \
        -e '/pmiectl\[[0-9]*]: .* (localhost) defined multiple times,/d' \
        -e '/ pmiectl\[[0-9]*]: mount: write error/d' \
        -e '/pcp\[[0-9]*]: pmie_check start failed in/d' \
        -e "/pmiectl\[[0-9]*]: sed: couldn't flush stdout/d"
}

# Extra lines that are OK in the journals of the pmlogger family.
#
# sudo babble; pmlogger_check, pmlogger_daily and pmlogger_janitor will
# fail during PCP builds; qa/1210 and qa/1213; lock collision with
# pmlogctl from pmlogger_farm_check; this strange one but only on bozo
# and vm03
#   ... bozo.localdomain pmlogctl[1013253]: mount: write error
# fallout from qa/1213 if pmlogger_farm_check goes off concurrently;
# and the rest is just random QA noise.
#
_cull_pmlogger()
{
    sed \
        -e '/sudo\[[0-9]*]: /d' \
        -e '/pmlogger_check failed - see .*\/pmlogger_check.log/d' \
        -e '/pmlogger_daily failed - see .*\/pmlogger_daily.log/d' \
        -e '/pmlogger_daily failed - see .*\/pmlogger_daily-K.log/d' \
        -e '/pmlogger_janitor failed - see .*\/pmlogger_janitor.log/d' \
        -e '/pmlogctl\[[0-9]*]: .* failed to start for host no\.such\.host\.pcp\.io/d' \
        -e '/pmlogctl\[[0-9]*]: .* is another pmlogctl job running concurrently?/d' \
        -e '/pmlogctl\[[0-9]*]: .*\/pmlogger\/lock$/d' \
        -e '/pmlogctl\[[0-9]*]: .* failed to acquire exclusive lock/d' \
        -e '/pmlogctl\[[0-9]*]: [0-9][0-9]* pmlogctl/d' \
        -e '/pmlogctl\[[0-9]*]: Terminated$/d' \
        -e '/ pmlogctl\[[0-9]*]: mount: write error/d' \
        -e '/vm03 pmlogctl\[[0-9]*]: mount: write error/d' \
        -e '/pmlogctl\[[0-9]*]: .* (localhost) defined multiple times,/d' \
        -e '/rc\[[0-9]*]: Terminated$/d' \
        -e '/pmlogger\[[0-9]*]: Terminated$/d' \
        -e '/rc\[[0-9]*]: .*not permanently enabled\./d' \
        -e '/rc\[[0-9]*]: .*run the following as root:/d' \
        -e '/rc\[[0-9]*]: .*systemctl enable pmlogger\.service/d' \
        -e '/rc\[[0-9]*]: .*\/pmsignal: .* No such process/d' \
        -e '/pcp\[[0-9]*]: pmlogger_check_failed in/d'
}

# Usage: ... | _filter service
#
# Copy stdin to stdout minus the expected lines: first the ones that are
# OK for any service, then the ones for the service named by $1 --
# "pmcd" exactly, any name starting with "pmie", any name starting with
# "pmlogger".  For every other name only the first stage applies.
#
_filter()
{
    _cull_any_service \
    | case "$1" in
        pmcd)
            _cull_pmcd
            ;;
        pmie*)
            _cull_pmie
            ;;
        pmlogger*)
            _cull_pmlogger
            ;;
        *)
            cat
            ;;
    esac
}
# real QA test starts here
# For each PCP service of interest print a "=== name ===" heading and
# then whatever is left of that unit's journal after _filter has culled
# the expected lines.
#
for svc in \
    pmcd \
    pmfind \
    pmie pmie_check pmie_daily pmie_farm pmie_farm_check \
    pmlogger pmlogger_check pmlogger_daily pmlogger_farm pmlogger_farm_check \
    pmproxy
do
    printf '\n=== %s ===\n' "$svc"

    # want entries for the past 24 hours; the start of that window is
    # handed to journalctl as YYYY-MM-DD HH:MM:SS
    since=`pmdate -1d '%Y-%m-%d %H:%M:%S'`

    # stderr is merged into stdout so that anything journalctl itself
    # complains about goes through _filter and into the output as well
    $sudo journalctl --no-pager --since="$since" "_SYSTEMD_UNIT=$svc.service" 2>&1 \
    | _filter "$svc"
done
# additional diagnostics for stuff we don't understand!
#
# on ubuntu1804-container in CI
# Mar 04 19:39:45 cd959fdf2242 pmcd[1179757]: /usr/lib/pcp/bin/pcp-reboot-init: 44: [: -ne: unexpected operator
#
# Append to $seq_full: where id is found, the numeric user id of this
# shell, and the numeric user id obtained when id is run via $sudo.
# Error output from all three goes to the same file.
{
    which id
    id -u
    $sudo id -u
} >>$seq_full 2>&1

# success, all done
exit
|