File: check.callback.sample

package info (click to toggle)
pcp 6.3.8-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 235,180 kB
  • sloc: ansic: 1,253,622; sh: 173,998; xml: 160,490; cpp: 83,331; python: 20,482; perl: 18,302; yacc: 6,886; makefile: 2,955; lex: 2,862; fortran: 60; java: 52
file content (338 lines) | stat: -rwxr-xr-x 9,174 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/bin/sh
#
# Trying to track down assorted transient QA environment issues.
#
# Run from check after each test completes, but you need to ensure
# the script is called check.callback and it is executable.
#
# Usage: check.callback [--precheck] [seq#]
#
# Note: This is a SAMPLE script, you should either
#	(a) $ ln check.callback.sample check.callback
#           or probably better to ensure tracking any upstream changes
#           $ ln -s check.callback.sample check.callback
#	    to have it activated "as is", or
#	(b) $ cp check.callback.sample check.callback
#	    and then edit check.callback to suit local needs ... see
#	    in particular the CONFIGURE-ME comments below
#

# When run standalone (not called from check) $PCP_LOG_DIR is not set,
# so the PCP configuration environment variables have to be sourced
# here (logic copied from common.rc) ... give up if that is not
# possible.
#
if [ -z "$PCP_LOG_DIR" ]
then
    if [ ! -r $PCP_DIR/etc/pcp.env ]
    then
	echo "Error: unable to read $PCP_DIR/etc/pcp.env!" >&2
	exit 1
    fi
    . $PCP_DIR/etc/pcp.env
fi

# Where the Security Enhanced Linux log messages are found, and
# whatever which(1) reports for sudo
audit="/var/log/audit/audit.log"
sudo=$(which sudo)

# Did pmlogger_daily get run as expected?
#
# Only attempted while $here/check.onetrip exists ... that file is
# created in "common", so it will be there at the start of any check
# (or recheck or check-flakey) run.  The intent is to do this at most
# once each time check is run, because a failure here would be
# repeated for ALL QA tests, which is not helpful.
#
if [ -f $here/check.onetrip ]
then
    # But don't bother if it is just after midnight!
    #
    case $(pmdate %H:%M)
    in
	00:[012]*)
	    ;;
	*)
	    yesterday=$(pmdate -1d %Y%m%d)
	    host=$(hostname)
	    case $host
	    in
		# CONFIGURE-ME
		# - may need to add extra hosts here to enable this test
		#
		bozo|bozo-vm|vm01|vm03|vm36)
		    # neither the compressed nor the uncompressed
		    # metadata file for yesterday's archive is there
		    #
		    if [ ! -f $PCP_LOG_DIR/pmlogger/$host/$yesterday.meta.xz ] && [ ! -f $PCP_LOG_DIR/pmlogger/$host/$yesterday.meta ]
		    then
			# after a virgin install, or cleaning out the
			# directory, or a hostname change, there will be
			# NO $yesterday files at all, so be quiet if this
			# is the case ... $nfile is the number of
			# $yesterday*.meta files that were found
			#
			nfile=$(find  $PCP_LOG_DIR/pmlogger/$host -type f | grep -c "^$PCP_LOG_DIR/pmlogger/$host/$yesterday.*\\.meta")
			if [ "$nfile" -gt 0 ]
			then
			    echo "check.callback: fail: pmlogger_daily not well!"
			    echo "Missing $yesterday archive but $nfile component archives ..."
			    ls -l $PCP_LOG_DIR/pmlogger/$host/$yesterday.*
			fi
		    fi
		    ;;
	    esac
	    ;;
    esac
fi

# All temporary files share the $tmp prefix.  On exit, or on a HUP,
# INT, QUIT or TERM signal, they are removed and the script exits
# with whatever value $status holds at that time.
#
tmp="/var/tmp/check.callback-$$"
status=0
trap "rm -f $tmp.*; exit \$status" EXIT HUP INT QUIT TERM

# --precheck mode: run the ./941 and ./870 checks with their output
# left visible, save the PCP-related AVC/SELINUX lines currently in
# the audit log to <seq#>.pre-avc, then exit without running any of
# the checks further below
#
case "$1"
in
    --precheck)
	shift
	echo "--- start pre-check ---"
	./941 --check $1
	./870 --check $1
	$sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit >$1.pre-avc 2>/dev/null
	echo "before $(wc -l <$1.pre-avc) AVC errors"
	echo "--- end pre-check ---"
	exit
	;;
esac

# CONFIGURE-ME
# Set abort to true if you want to stop QA as soon as a problem is
# seen, otherwise set it to false and the checks will be run and
# reported, but QA will continue on
# Aborting is good for hard problems, because you'd like to know which
# QA test is breaking things and there is no point continuing.
# Not aborting is good for transient problems, because the breakage may
# be repaired autonomously (usually timing related) or by a later QA
# test.
#
abort=false

# Check pmcd status ... on failure show what ./941 wrote (stderr
# first, then stdout)
#
if ! ./941 --check $1 >$tmp.out 2>$tmp.err
then
    echo "check.callback: fail: pmcd not well!"
    cat $tmp.err $tmp.out
    if $abort
    then
	status=1
    fi
fi

# Check pmlogger status
#
# A failure may be a transient thing, e.g. cron-driven log rotation,
# so a failed check is tried again after a short wait and pmlogger is
# only reported as not well if both attempts fail.
#
if ./870 --check $1 >$tmp.out.1 2>$tmp.err.1
then
    # ok
    :
else
    # wait a bit and try again
    #
    sleep 3

    if ./870 --check $1 >$tmp.out.2 2>$tmp.err.2
    then
	# ok this time
	:
    else
	# failed twice ... report first failure and if second failure
	# is different, report that also
	#
	cat $tmp.err.1 $tmp.out.1

	# $differs holds the name of the command (true or false) that
	# is run below to decide if the second failure is reported
	#
	differs=false
	if cmp $tmp.err.1 $tmp.err.2 >/dev/null
	then
	    # stderr is the same ... strip timestamps from ./870
	    # output, then cmp
	    #
	    sed -e '/^Now:/d' <$tmp.out.1 >$tmp.tmp.1
	    sed -e '/^Now:/d' <$tmp.out.2 >$tmp.tmp.2
	    if cmp $tmp.tmp.1 $tmp.tmp.2 >/dev/null
	    then
		:
	    else
		differs=true
	    fi
	else
	    differs=true
	fi
	# this must test the flag set above ... testing a variable
	# that is not set would always succeed, and then the second
	# failure is reported even when it is identical to the first
	#
	if $differs
	then
	    echo "... and different output from 3 seconds later"
	    cat $tmp.err.2 $tmp.out.2
	fi
	echo "check.callback: fail: pmlogger not well!"
	$abort && status=1
    fi
fi

# Some PMDAs have trouble starting on some VMs ... if one of these
# (nfsclient or openmetrics) has failed, try restarting it by asking
# pmcd to act as if it received SIGHUP ... skip ones that are OK or
# NOTREADY [pmcd.agent.status is 0 or 1]
#
pminfo -f pmcd.agent.status | grep 'value [^01]' >$tmp.out
if grep -e '"nfsclient"' -e '"openmetrics"' $tmp.out >/dev/null
then
    pmstore pmcd.control.sighup 1 >/dev/null
    sleep 2
fi

# Check are all PMDAs alive and well
#
# Each "inst [N or "name"] value S" line from pminfo is rewritten by
# sed as "N name S".  Any PMDA with a status other than 0 or 1 is
# reported, along with pmcd.log and any of that PMDA's own log files
# that exist.  The PMDA names are collected in $tmp.pmdalist.
#
if pminfo -f pmcd.agent.status >$tmp.out 2>$tmp.err
then
    rm -f $tmp.pmdalist $tmp.pmdabad
    sed -n <$tmp.out \
	-e '/^  *inst /{
s/^  *inst \[//
s/ or / /
s/"//g
s/] value / /
p
}' \
    | while read domain name pmda_status
    do
	if [ "$pmda_status" != 0 ] && [ "$pmda_status" != 1 ]
	then
	    echo "check.callback: fail: PMDA $name not well, status=$pmda_status!"
	    for log in $PCP_LOG_DIR/pmcd/pmcd.log $PCP_LOG_DIR/pmcd/$name.log*
	    do
		if [ -f "$log" ]
		then
		    echo "=== start $log ==="
		    ls -l $log
		    cat $log
		    echo "=== end $log ==="
		fi
	    done
	    # this loop is part of a pipeline, so an assignment to
	    # $status here may be made in a subshell and lost ... leave
	    # a marker file behind instead
	    #
	    : >$tmp.pmdabad
	fi
	echo "$name" >>$tmp.pmdalist
    done
    if [ -f $tmp.pmdabad ]
    then
	$abort && status=1
    fi
    # NOTE(review): $tmp includes the PID of this process and the exit
    # trap removes $tmp.*, so it looks like $tmp.pmdalist.prev can
    # never be left over from an earlier run ... confirm if this
    # comparison is ever made
    #
    if [ -f $tmp.pmdalist.prev ]
    then
	if diff -u $tmp.pmdalist.prev $tmp.pmdalist >$tmp.out
	then
	    :
	else
	    echo "check.callback: fail: installed PMDAs changed!"
	    cat $tmp.out
	    $abort && status=1
	fi
    fi
    cp $tmp.pmdalist $tmp.pmdalist.prev
else
    echo "check.callback: fail: pminfo not well!"
    cat $tmp.err $tmp.out
    $abort && status=1
fi

# Check the PMNS ... specifically looking for cases where a PMDA is
# Install(ed) and pmcd.conf is restored without a Remove ... this leaves
# the PMDA not in pmcd.conf, but with bogus entries in the PMNS
#
# CONFIGURE-ME
# - may need to add metrics (in the sed part) that are OK to return
#   'Unknown or illegal metric', but this is most unlikely
#
# NOTE:
# pmproxy metrics are a bit odd because they come from an aliased
# mmv PMDA that might take a while to get itself organized after
# a pmcd restart
#
# Any complaint left in $tmp.out after the sed filtering is a failure.
#
pminfo -v -b 2000 2>&1 \
| grep -E '(Unknown or illegal metric)|(Unknown metric name)' \
| sed >$tmp.out \
    -e '/^sample.*\.bad\.unknown:/d' \
    -e '/^pmproxy\./d' \
    # end
if [ -s $tmp.out ]
then
    echo "check.callback: fail: PMNS not well!"
    cat $tmp.out
    # To help with triage, for every PMDA with a non-zero
    # pmcd.agent.status show the pmcd.log lines that mention it and
    # the PMDA's own log ... sed reduces each pminfo instance line to
    # "name status"
    #
    pminfo -f pmcd.agent.status \
    | sed \
	-e '/^pmcd\.agent\.status/d' \
	-e 's/.* or "//' \
	-e 's/"] value / /' \
	-e '/^$/d' \
    | while read pmda pmda_status
    do
	if [ "$pmda_status" -ne 0 ]
	then
	    echo "Warning: $pmda PMDA exited? pmcd.agent.status=$pmda_status"
	    echo "Relevant pmcd.log lines ..."
	    grep "$pmda" "$PCP_LOG_DIR/pmcd/pmcd.log"
	    if [ -f "$PCP_LOG_DIR/pmcd/$pmda.log" ]
	    then
		echo "=== $pmda.log ==="
		cat "$PCP_LOG_DIR/pmcd/$pmda.log"
	    else
		# name the file that was tested for, not pmcd.log
		echo "Warning: $PCP_LOG_DIR/pmcd/$pmda.log: not found"
	    fi
	fi
    done
    $abort && status=1
fi

# More pmcd/pmda/pmns checks and check for config files not returned
# to their pre-QA state ... any output at all from ./1190 is treated
# as a failure
#
./1190 --check $1 >$tmp.out
if test -s $tmp.out
then
    echo "check.callback: fail: see below!"
    cat $tmp.out
    if $abort
    then
	status=1
    fi
fi

# Check audit log for any Security Enhanced Linux access denials
# related to PCP ...
#
# The matching audit log lines are saved in <seq#>.post-avc and, when
# a <seq#>.pre-avc file exists, the lines that diff reports as only
# being in the post-avc file are the new denials.
#
$sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit 2>/dev/null >$1.post-avc
if [ -f $1.pre-avc ]
then
    diff $1.pre-avc $1.post-avc \
    | sed -n -e '/> /s///p' >$tmp.out
    # Now there are _some_ AVCs reported here that are selinux snarfoos
    # rather than problems with PCP and the associated PCP policy files.
    # These ones have been triaged up the wazoo, and can safely be ignored
    # here.
    #
    case $(admin/whatami)
    in
	vm39*RHEL\ 8.5*)
	    sed <$tmp.out >$tmp.tmp \
		-e '/{ write } .* comm="rpcinfo" name="rpcbind.sock" .* scontext=system_u:system_r:pcp_pmcd_t:s0 tcontext=system_u:object_r:rpcbind_var_run_t:s0 tclass=sock_file/d' \
		-e '/{ sys_resource } .* comm="pmproxy" capability=24  scontext=system_u:system_r:pcp_pmproxy_t:s0 tcontext=system_u:system_r:pcp_pmproxy_t:s0 tclass=capability/d' \
		-e '/{ sys_admin } .* comm="pmproxy" capability=21  scontext=system_u:system_r:pcp_pmproxy_t:s0 tcontext=system_u:system_r:pcp_pmproxy_t:s0 tclass=capability/d' \
	    # end
	    # if something was culled, record the culled lines in
	    # <seq#>.full and carry on with what is left
	    #
	    if ! diff $tmp.out $tmp.tmp >$tmp.new
	    then
		echo "Note: check.callback chose to culled these new AVCs ..." >>$1.full
		sed -n -e '/> /s///p' <$tmp.new >>$1.full
		mv $tmp.tmp $tmp.out
	    fi
	    ;;
    esac
    if [ -s $tmp.out ]
    then
	echo "check.callback: fail: new SELinux/AVC denials"
	cat $tmp.out
	echo "after $(wc -l <$1.post-avc) AVC errors"
	if $abort
	then
	    status=1
	fi
    fi
fi
rm -f $1.pre-avc $1.post-avc

# The pmlogger_daily check near the start of this script is only done
# while $here/check.onetrip exists, so remove that file (same name as
# is tested there) to stop the check being repeated after later tests
# in this run
#
rm -f $here/check.onetrip

exit