File: check.callback.sample

package info (click to toggle)
pcp 6.3.8-1
links: PTS
area: main
in suites: forky, sid, trixie
size: 235,180 kB
sloc: ansic: 1,253,622; sh: 173,998; xml: 160,490; cpp: 83,331; python: 20,482; perl: 18,302; yacc: 6,886; makefile: 2,955; lex: 2,862; fortran: 60; java: 52
file content (338 lines) | stat: -rwxr-xr-x 9,174 bytes
parent folder | download | duplicates (2)
#!/bin/sh
#
# Trying to track down assorted transient QA environment issues.
#
# Run from check after each test completes, but you need to ensure
# the script is called check.callback and it is executable.
#
# Usage: check.callback [--precheck] [seq#]
#
# Note: This is a SAMPLE script, you should either
#	(a) $ ln check.callback.sample check.callback
#           or probably better to ensure tracking any upstream changes
#           $ ln -s check.callback.sample check.callback
#	    to have it activated "as is", or
#	(b) $ cp check.callback.sample check.callback
#	    and then edit check.callback to suit local needs ... see
#	    in particular the CONFIGURE-ME comments below
#

# An empty $PCP_LOG_DIR means we are running standalone (not called from
# check), so the PCP configuration environment variables have to be
# sourced here (logic copied from common.rc).
#
if [ -z "$PCP_LOG_DIR" ]; then
    # give up straight away if the environment file cannot be read
    [ -r $PCP_DIR/etc/pcp.env ] || {
	echo "Error: unable to read $PCP_DIR/etc/pcp.env!" >&2
	exit 1
    }
    . $PCP_DIR/etc/pcp.env
fi

# sudo as located by which(1), used to prefix the greps of the audit log
sudo=$(which sudo)
# where Security Enhanced Linux log messages are looked for
audit=/var/log/audit/audit.log

# Did pmlogger_daily get run as expected?
#
# Guarded by $here/check.onetrip so it is intended to happen at most once
# per check run ... a failure here would otherwise be repeated for ALL QA
# tests, which is not helpful.  The guard file is created in "common", so
# it is present at the start of any check (or recheck or check-flakey)
# run.
#
if [ -f $here/check.onetrip ]; then
    case $(pmdate %H:%M) in
	00:[012]*)
	    # 00:00 through 00:29 ... just after midnight, don't bother
	    ;;
	*)
	    yesterday=$(pmdate -1d %Y%m%d)
	    host=$(hostname)
	    # basename (no suffix) of the archive expected for yesterday
	    archive=$PCP_LOG_DIR/pmlogger/$host/$yesterday
	    case $host in
		# CONFIGURE-ME
		# - may need to add extra hosts here to enable this test
		#
		bozo|bozo-vm|vm01|vm03|vm36)
		    if [ ! -f $archive.meta.xz ] && [ ! -f $archive.meta ]; then
			# after a virgin install, or cleaning out the
			# directory, or a hostname change, there will be
			# NO $yesterday files at all, so only complain
			# when some $yesterday*.meta files are present
			#
			nfile=$(find  $PCP_LOG_DIR/pmlogger/$host -type f \
				| grep "^$archive.*\\.meta" \
				| wc -l \
				| sed -e 's/ //g')
			if [ "$nfile" -gt 0 ]; then
			    echo "check.callback: fail: pmlogger_daily not well!"
			    echo "Missing $yesterday archive but $nfile component archives ..."
			    ls -l $archive.*
			fi
		    fi
		    ;;
	    esac
	    ;;
    esac
fi

# Exit status and scratch file prefix.  The trap (on exit and on signals
# 1, 2, 3 and 15) removes all $tmp.* files and exits with whatever
# $status holds at that time.
#
status=0
tmp=/var/tmp/check.callback-$$
trap "rm -f $tmp.*; exit \$status" 0 1 2 3 15

# --precheck mode: run the pmcd (./941) and pmlogger (./870) checks with
# their output unfiltered, save the PCP-related AVC/SELINUX audit lines
# in <seq>.pre-avc, report how many there are, and stop.
#
case "$1" in
    --precheck)
	shift
	echo "--- start pre-check ---"
	./941 --check $1
	./870 --check $1
	$sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit >$1.pre-avc 2>/dev/null
	echo "before $(wc -l <$1.pre-avc) AVC errors"
	echo "--- end pre-check ---"
	exit
	;;
esac

# CONFIGURE-ME
# abort=true  => stop QA as soon as a problem is seen
# abort=false => run and report all the checks, but let QA continue
#
# Aborting suits hard problems: you want to know which QA test is
# breaking things and there is no point continuing.
# Not aborting suits transient problems: the breakage may be repaired
# autonomously (usually timing related) or by a later QA test.
#
# $abort is executed as a command ("true" or "false") by the checks.
#
abort=false

# Check pmcd status ... stdout and stderr of ./941 are only shown when
# the check fails
#
if ! ./941 --check $1 >$tmp.out 2>$tmp.err; then
    echo "check.callback: fail: pmcd not well!"
    cat $tmp.err $tmp.out
    $abort && status=1
fi

# Check pmlogger status
#
# ./870 --check is run and, if that fails, run once more 3 seconds
# later.  Nothing is reported unless both attempts fail.
#
if ./870 --check $1 >$tmp.out.1 2>$tmp.err.1
then
    # ok
    :
else
    # may be a transient thing, e.g. cron-driven log rotation
    # wait a bit and try again
    #
    sleep 3

    if ./870 --check $1 >$tmp.out.2 2>$tmp.err.2
    then
	# ok this time
	:
    else
	# failed twice ... report first failure and if second failure
	# is different, report that also
	#
	cat $tmp.err.1 $tmp.out.1
	# $diff holds "true" or "false" and is executed as a command
	# below; assume the two failures differ until both stderr and
	# the timestamp-stripped stdout compare equal
	#
	diff=true
	if cmp $tmp.err.1 $tmp.err.2 >/dev/null
	then
	    # strip timestamps from ./870 output, then cmp
	    #
	    sed -e '/^Now:/d' <$tmp.out.1 >$tmp.tmp.1
	    sed -e '/^Now:/d' <$tmp.out.2 >$tmp.tmp.2
	    cmp $tmp.tmp.1 $tmp.tmp.2 >/dev/null && diff=false
	fi
	# test the flag set above ... testing any other (unset) variable
	# expands to an empty command, which always succeeds
	#
	if $diff
	then
	    echo "... and different output from 3 seconds later"
	    cat $tmp.err.2 $tmp.out.2
	fi
	echo "check.callback: fail: pmlogger not well!"
	$abort && status=1
    fi
fi

# Some PMDAs have trouble starting on some VMs ... if one of the PMDAs
# named below has failed, ask pmcd to restart it by storing 1 into
# pmcd.control.sighup, then give it 2 seconds.  At most one sighup is
# sent, even if both PMDAs have failed.
#
# Lines for PMDAs that are OK or NOTREADY [pmcd.agent.status is 0 or 1]
# are dropped by the first grep.
# NOTE(review): that grep only looks at the first character after
# "value ", so any status starting with 0 or 1 (e.g. 127) is dropped
# too -- confirm this is intended
#
pminfo -f pmcd.agent.status | grep 'value [^01]' >$tmp.out
for agent in nfsclient openmetrics
do
    if grep "\"$agent\"" $tmp.out >/dev/null
    then
	pmstore pmcd.control.sighup 1 >/dev/null
	sleep 2
	break
    fi
done

# Check are all PMDAs alive and well
#
# Every instance of pmcd.agent.status is expected to have the value 0 or
# 1; for anything else the failure is reported along with pmcd.log and
# the PMDA's own log file(s).  The names of all PMDAs seen are collected
# in $tmp.pmdalist.
#
if pminfo -f pmcd.agent.status >$tmp.out 2>$tmp.err
then
    rm -f $tmp.pmdalist $tmp.pmdafail
    # reduce each '  inst [N or "name"] value S' line to 'N name S'
    sed -n <$tmp.out \
	-e '/^  *inst /{
s/^  *inst \[//
s/ or / /
s/"//g
s/] value / /
p
}' \
    | while read domain name pmda_status
    do
	if [ "$pmda_status" != 0 ] && [ "$pmda_status" != 1 ]
	then
	    echo "check.callback: fail: PMDA $name not well, status=$pmda_status!"
	    for log in $PCP_LOG_DIR/pmcd/pmcd.log $PCP_LOG_DIR/pmcd/$name.log*
	    do
		# also skips the literal pattern when $name.log* matches
		# no files
		if [ -f "$log" ]
		then
		    echo "=== start $log ==="
		    ls -l "$log"
		    cat "$log"
		    echo "=== end $log ==="
		fi
	    done
	    # this loop is part of a pipeline, so it may run in a
	    # subshell where an assignment to status would be lost ...
	    # leave a marker file for the parent shell instead
	    #
	    $abort && echo "$name" >>$tmp.pmdafail
	fi
	echo "$name" >>$tmp.pmdalist
    done
    # pick up any abort request made from inside the loop
    [ -s $tmp.pmdafail ] && status=1
    # NOTE(review): $tmp.pmdalist.prev is only ever created by the cp
    # below, and $tmp is built from $$ with all $tmp.* files removed by
    # the exit trap, so it looks like this comparison can never find a
    # list from an earlier invocation -- confirm
    #
    if [ -f $tmp.pmdalist.prev ]
    then
	if diff -u $tmp.pmdalist.prev $tmp.pmdalist >$tmp.out
	then
	    :
	else
	    echo "check.callback: fail: installed PMDAs changed!"
	    cat $tmp.out
	    $abort && status=1
	fi
    fi
    cp $tmp.pmdalist $tmp.pmdalist.prev
else
    echo "check.callback: fail: pminfo not well!"
    cat $tmp.err $tmp.out
    $abort && status=1
fi

# Check the PMNS ... specifically looking for cases where a PMDA is
# Install(ed) and pmcd.conf is restored without a Remove ... this leaves
# the PMDA not in pmcd.conf, but with bogus entries in the PMNS
#
# CONFIGURE-ME
# - may need to add metrics (in the sed part) that are OK to return
#   'Unknown or illegal metric', but this is most unlikely
#
# NOTE:
# pmproxy metrics are a bit odd because they come from an aliased
# mmv PMDA that might take a while to get itself organized after
# a pmcd restart
#
# The "# end" line terminates the backslash-continued sed command, so
# every -e line above it can end with a backslash.
#
pminfo -v -b 2000 2>&1 \
| grep -E '(Unknown or illegal metric)|(Unknown metric name)' \
| sed >$tmp.out \
    -e '/^sample.*\.bad\.unknown:/d' \
    -e '/^pmproxy\./d' \
    # end
if [ -s $tmp.out ]
then
    echo "check.callback: fail: PMNS not well!"
    cat $tmp.out
    # For extra diagnostics, reduce each instance line of
    # pmcd.agent.status to "name status" and report on every PMDA with
    # a non-zero status.
    # NOTE(review): other checks in this script treat status 1 as OK as
    # well; here only 0 is -- confirm this is intended
    #
    pminfo -f pmcd.agent.status \
    | sed \
	-e '/^pmcd\.agent\.status/d' \
	-e 's/.* or "//' \
	-e 's/"] value / /' \
	-e '/^$/d' \
    | while read pmda pmda_status
    do
	if [ "$pmda_status" -ne 0 ]
	then
	    echo "Warning: $pmda PMDA exited? pmcd.agent.status=$pmda_status"
	    echo "Relevant pmcd.log lines ..."
	    grep "$pmda" "$PCP_LOG_DIR/pmcd/pmcd.log"
	    if [ -f "$PCP_LOG_DIR/pmcd/$pmda.log" ]
	    then
		echo "=== $pmda.log ==="
		cat "$PCP_LOG_DIR/pmcd/$pmda.log"
	    else
		# name the file that was actually tested for
		echo "Warning: $PCP_LOG_DIR/pmcd/$pmda.log: not found"
	    fi
	fi
    done
    $abort && status=1
fi

# More pmcd/pmda/pmns checks and check for config files not returned
# to their pre-QA state ... any output at all on stdout from
# ./1190 --check is treated as a failure and shown
#
./1190 --check $1 >$tmp.out
[ -s $tmp.out ] && {
    echo "check.callback: fail: see below!"
    cat $tmp.out
    $abort && status=1
}

# Check audit log for any Security Enhanced Linux access denials
# related to PCP ...
#
# The matching audit lines are saved in <seq>.post-avc and, if a
# <seq>.pre-avc file exists (written in --precheck mode), the lines
# present only in the post file are the new denials.
#
$sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit 2>/dev/null >$1.post-avc
if [ -f $1.pre-avc ]
then
    # lines only in the second file are prefixed "> " by diff
    diff $1.pre-avc $1.post-avc \
    | sed -n -e '/> /s///p' >$tmp.out
    # Now there are _some_ AVCs reported here that are selinux snarfoos
    # rather than problems with PCP and the associated PCP policy files.
    # These ones have been triaged up the wazoo, and can safely be ignored
    # here.
    #
    case `admin/whatami`
    in
	vm39*RHEL\ 8.5*)
	    sed <$tmp.out >$tmp.tmp \
		-e '/{ write } .* comm="rpcinfo" name="rpcbind.sock" .* scontext=system_u:system_r:pcp_pmcd_t:s0 tcontext=system_u:object_r:rpcbind_var_run_t:s0 tclass=sock_file/d' \
		-e '/{ sys_resource } .* comm="pmproxy" capability=24  scontext=system_u:system_r:pcp_pmproxy_t:s0 tcontext=system_u:system_r:pcp_pmproxy_t:s0 tclass=capability/d' \
		-e '/{ sys_admin } .* comm="pmproxy" capability=21  scontext=system_u:system_r:pcp_pmproxy_t:s0 tcontext=system_u:system_r:pcp_pmproxy_t:s0 tclass=capability/d' \
	    # end
	    if diff $tmp.out $tmp.tmp >$tmp.new
	    then
		:
	    else
		echo "Note: check.callback chose to culled these new AVCs ..." >>$1.full
		# the culled lines are the ones only in the first file
		# ($tmp.out), which diff prefixes with "< "
		sed -n -e '/^< /s///p' <$tmp.new >>$1.full
		mv $tmp.tmp $tmp.out
	    fi
	    ;;
    esac
    if [ -s $tmp.out ]
    then
	echo "check.callback: fail: new SELinux/AVC denials"
	cat $tmp.out
	echo "after `wc -l <$1.post-avc` AVC errors"
	$abort && status=1
    fi
fi
rm -f $1.pre-avc $1.post-avc

# Remove the once-per-run guard file tested by the pmlogger_daily check
# ($here/check.onetrip), so that check is not repeated for later tests
# in the same check run
#
rm -f $here/check.onetrip

# the trap on 0 does the cleanup and exits with $status
exit