1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
|
#!/bin/sh
#
# report issues in the collection of PCP archives managed by
# pmlogger_check and pmlogger_daily
#
. /etc/pcp.conf

# Every scratch file used by this script is named $tmp.<suffix>.
# The files live in a private directory created by mktemp(1) rather
# than under a predictable /var/tmp/check-archives.<pid> name, so
# another user cannot pre-create (or symlink) them.
#
tmpdir=`mktemp -d /var/tmp/check-archives.XXXXXX` || {
    echo "check-archives: Error: cannot create temporary directory" >&2
    exit 1
}
tmp=$tmpdir/tmp

# $tmp.sts holds the highest status seen so far (see _set_sts); it is
# kept in a file so that updates made in subshells are not lost.
#
echo 0 >"$tmp.sts"

# On exit, or on HUP/INT/QUIT/TERM: _exit loads the final status into
# $sts and removes the scratch files, then the (now empty) private
# directory is removed and the script exits with that status.
#
trap '_exit; rmdir "$tmpdir" 2>/dev/null; exit $sts' 0 1 2 3 15
# Exit-time cleanup, called from the trap handler.
#
# Loads the final status from $tmp.sts into the global $sts (which the
# trap then passes to exit) and removes all $tmp.* scratch files.
#
# The handler can run twice (once for a signal, then again for the
# resulting exit), so $sts is only refreshed while the status file
# still exists; a second call keeps the value from the first one.
#
_exit()
{
    # an empty prefix would make the rm below expand to ".*" in the
    # current directory, so do nothing in that case
    [ -n "$tmp" ] || return 0

    if [ -f "$tmp.sts" ]
    then
        sts=`cat "$tmp.sts"`
    fi
    rm -f "$tmp".*
    return 0
}
# Record the biggest status seen so far in $tmp.sts.  A file is used
# (rather than a shell variable) so that calls made from subshells,
# e.g. inside piped while loops, are still seen at exit.
#
# $1 values
# 0 no error
# 1 warning
# 2 error in archives' status
# 4 fatal botch in this script or environment
#
# The stored status only ever increases.  If the status file is
# missing or does not hold a number, the previous status is taken to
# be 0 so the new status is still recorded.  Always returns 0.
#
_set_sts()
{
    _sts=`cat "$tmp.sts" 2>/dev/null`
    case "$_sts"
    in
        ''|*[!0-9]*)
            _sts=0
            ;;
    esac
    if [ "$_sts" -lt "$1" ]
    then
        echo "$1" >"$tmp.sts"
    fi
    return 0
}
# Filter: copy stdin to stdout, removing a trailing compression suffix
# (.xz .lzma .bz2 .bz .gz .Z .z) from each line.  The substitutions are
# applied in the order listed, once each, to every input line.
#
_strip_compress_suffix()
{
    sed '
s/\.xz$//
s/\.lzma$//
s/\.bz2$//
s/\.bz$//
s/\.gz$//
s/\.Z$//
s/\.z$//'
}
# name used as the prefix of this script's own error messages
progname=`basename "$0"`

# Everything below works relative to $PCP_LOG_DIR/pmlogger
# (PCP_LOG_DIR is expected to come from /etc/pcp.conf -- confirm).
# The expansions are quoted so a path containing whitespace does not
# break the test or the cd.  Failure here is fatal (status 4).
#
if [ ! -d "$PCP_LOG_DIR/pmlogger" ]
then
    echo "$progname: Error: $PCP_LOG_DIR/pmlogger does not exist"
    _set_sts 4
    exit
fi

if cd "$PCP_LOG_DIR/pmlogger"
then
    :
else
    echo "$progname: Error: cannot chdir to $PCP_LOG_DIR/pmlogger"
    _set_sts 4
    exit
fi
# recent script log and timestamp files?
#
# $tmp.tmp receives the names of the regular files matching pmlogger_*
# in the current directory that find's "-mtime -1" selects; each
# expected stamp file must be in that list, otherwise a warning is
# reported and the status raised to 1.
#
find pmlogger_* -maxdepth 0 -type f -mtime -1 >$tmp.tmp
for stamp in pmlogger_daily.stamp # TODO pmlogger_daily_report.stamp
do
    # stamp file is recent, nothing to report
    grep "^$stamp\$" <$tmp.tmp >/dev/null && continue

    echo
    echo "Warning: no $stamp file in the last 24 hours"
    ls -l $stamp* 2>&1 | sed -e 's/^/ /'
    _set_sts 1
done
# remove "expected" lines from logs and report if anything remains
#
# $tmp.tmp (written by the find above) lists the recent pmlogger_*
# files.  The stamp files are dropped from that list, and each
# remaining log has its known-benign lines filtered out.  If anything
# is left, the whole log is shown and a warning (status 1) is recorded.
#
sed <$tmp.tmp \
    -e '/stamp$/d' \
    -e '/stamp.prev$/d' \
| while read log
do
    # Filter into $tmp.residue, not $tmp.tmp: the sed feeding this loop
    # may still be reading $tmp.tmp, so it must not be overwritten here.
    #
    case $log
    in
        pmlogger_check.log*)
            sed <"$log" >$tmp.residue \
                -e '/^pmlogger_check: \[.*[0-9]]$/d' \
                -e '/^Restarting primary pmlogger for host .* done$/d' \
                -e '/^Latest folio created for .*[0-9]$/d' \
                -e '/^Duplicate archive basename \.\.\. rename .*\*$/d' \
                -e '/^\.\.\. logging for host ".*" unchanged$/d'
            ;;
        pmlogger_daily-K.log*)
            sed <"$log" >$tmp.residue \
                -e '/^pmlogger_daily: \[.*[0-9]]$/d' \
                -e '/Error: no pmlogger instance running for host/d' \
                -e '/^\.\.\. logging for host ".*" unchanged$/d'
            ;;
        pmlogger_daily_report.log*)
            sed <"$log" >$tmp.residue \
                -e '/PM_ERR_NAME Unknown metric name/d'
            ;;
        *)
            # no known-benign lines for other logs, everything counts
            cat "$log" >$tmp.residue
            ;;
    esac

    if [ -s $tmp.residue ]
    then
        echo
        echo "Warning: Unexpected lines (!) in $log"
        ls -l "$log" 2>&1 | sed -e 's/^/ /'

        # diff -e gives an ed script that turns the log into the
        # residue; its "Nd" and "N,Md" commands name the lines that
        # were filtered out (the expected ones).  Each is rewritten
        # as a sed "s/^/s/" command on the same address, so $tmp.sed
        # tags the expected lines with a leading "s".
        #
        diff -e "$log" $tmp.residue \
        | sed -n >$tmp.sed \
            -e '/^[0-9][0-9]*d/s/d.*/s\/^\/s\//p' \
            -e '/^[0-9][0-9]*,[0-9][0-9]*d/s/d.*/s\/^\/s\//p'
        if [ -s $tmp.sed ]
        then
            # some expected lines
            :
        else
            # all unexpected lines
            # NOTE(review): this tags every line with "s", so none of
            # them gets the "!" marker below -- confirm that is intended
            echo "s/^/s/" >$tmp.sed
        fi

        # show the log: lines still starting with a space (untagged)
        # get "!", and the "s" tag is removed from tagged lines
        #
        sed -e 's/^/ /' <"$log" \
        | sed -f $tmp.sed \
        | sed \
            -e '/^ /s/ /!/' \
            -e '/^s/s/s / /' \
            -e 's/^/ /'

        # only raise the status when a warning was actually reported
        _set_sts 1
    fi
done
# traverse each directory below here ... assume it contains PCP
# archives of interest
#
# For every sub-directory (other than SaveLogs*) this
#  - derives the archive base names from the files that are present,
#  - checks each archive has an index file, a metadata file and data
#    volumes 0 .. N (N being the highest volume number found),
#  - runs "pmlogcheck -w" on archives with no missing pieces, and
#  - looks back over the last 31 days for missing or unmerged archives.
#
# The loop is the last stage of a pipeline, so in shells that run
# pipeline stages in a subshell an "exit" below only ends this loop;
# status is therefore always recorded via _set_sts before exiting.
#

# Emit a blank separator line ahead of the first error reported for
# the archive being checked, and create $tmp.err to record that this
# archive has at least one error.
#
_flag_err()
{
    if [ ! -f $tmp.err ]
    then
        echo
        touch $tmp.err
    fi
}

# Print the highest data volume number found for archive base name $1,
# i.e. the largest N over files named $1.N (optionally compressed).
# Only names that are exactly "<base>.<digits>" once the compression
# suffix is removed are considered, so files of other archives whose
# names merely start with the same prefix are ignored.  Prints nothing
# if there is no such file.
#
_max_volume()
{
    for _file in $1.[0-9]*
    do
        echo "$_file"
    done \
    | _strip_compress_suffix \
    | awk -v prefix="$1." '
        index($0, prefix) == 1 {
            v = substr($0, length(prefix) + 1)
            if (v ~ /^[0-9][0-9]*$/ && (!seen || v + 0 > max)) {
                max = v + 0
                seen = 1
            }
        }
        END { if (seen) print max }'
}

find * -maxdepth 0 -type d \
| sed \
    -e '/^SaveLogs/d' \
| while read dir
do
    if cd "$dir"
    then
        :
    else
        echo "$progname: Error: cannot chdir to $PCP_LOG_DIR/pmlogger/$dir"
        _set_sts 4
        exit
    fi
    echo
    echo "=== $dir ==="

    # all the files of interest begin YYYY or YY (old school)
    #
    if [ "`echo [0-9]*`" = '[0-9]*' ]
    then
        echo "Warning: no PCP archives here?"
        _set_sts 1
        cd ..
        continue
    fi

    # reduce the file names to the set of unique archive base names by
    # removing compression, volume number, .index and .meta suffixes
    #
    ls [0-9]* \
    | _strip_compress_suffix \
    | sed \
        -e 's/\.[0-9][0-9]*$//' \
        -e 's/\.index$//' \
        -e 's/\.meta$//' \
    | sort \
    | uniq >$tmp.list

    # for each archive base name in $tmp.list expect ...
    #
    for base in `cat $tmp.list`
    do
        rm -f $tmp.err

        # ... an index file (reported as an error if missing)
        # an unexpanded glob means no file matched
        #
        if [ "`echo $base.index*`" = "$base.index*" ]
        then
            _flag_err
            echo "Error: $base.index is missing"
        fi

        # ... and a metadata file (error if missing)
        #
        if [ "`echo $base.meta*`" = "$base.meta*" ]
        then
            _flag_err
            echo "Error: $base.meta is missing"
        fi

        # ... and data volume 0 file (error if missing)
        #
        if [ -f $base.0 -o -f "`echo $base.0.*`" ]
        then
            # we expect all the volumes from 0, 1, 2, ... N
            # where N is the maximum volume seen for this base name;
            # N is worked out once so that each missing volume is
            # reported once only
            #
            maxvol=`_max_volume $base`
            if [ -z "$maxvol" ]
            then
                echo "Botch: could not get maxvol for base=$base"
                _set_sts 4
                exit
            fi

            # already tested for volume 0 above, and volume N exists
            # by construction, so just the ones in between (if any)
            #
            vol=1
            while [ $vol -lt $maxvol ]
            do
                if [ -f $base.$vol -o -f "`echo $base.$vol.*`" ]
                then
                    :
                else
                    _flag_err
                    echo "Error: $base.$vol: data volume is missing"
                fi
                vol=`expr $vol + 1`
            done
        else
            _flag_err
            echo "Error: $base.0 is missing"
        fi

        if [ -f $tmp.err ]
        then
            # something is missing, show what is there
            ls -l $base* 2>&1 | sed -e 's/^/ /'
            _set_sts 2
        else
            # all the bits-n-pieces appear to be in place, what does
            # pmlogcheck make of it?
            #
            if pmlogcheck -w $base >$tmp.tmp
            then
                :
            else
                echo "Botch: pmlogcheck -w $base failed!"
                _set_sts 4
                exit
            fi
            # any output from pmlogcheck is treated as a failure
            if [ -s $tmp.tmp ]
            then
                echo "Error: $base: pmlogcheck failure"
                sed -e 's/^/ /' <$tmp.tmp
                _set_sts 2
            fi
        fi
    done

    # Now, go back 31 days ... find the first archive and then expect
    # to see an archive for each date between then and yesterday.
    # If the archive basename is not YYYYMMDD, then a merge has been
    # missed, so report this also.
    #
    # $tmp.found is created at the first date with an archive; dates
    # before that are not reported as missing.
    #
    rm -f $tmp.found
    back=31
    while [ "$back" -gt 0 ]
    do
        # presumably the date $back days ago as YYYYMMDD -- see pmdate
        date=`pmdate -${back}d '%Y%m%d'`
        if [ -z "$date" ]
        then
            echo "Botch: pmdate -${back}d failed!"
            _set_sts 4
            exit
        fi
        grep "$date" $tmp.list >$tmp.tmp
        if [ -s $tmp.tmp ]
        then
            [ -f $tmp.found ] || touch $tmp.found
            # expect exactly one archive
            #
            narch="`wc -l <$tmp.tmp | sed -e 's/[ ]//g'`"
            if [ "$narch" -ne 1 ]
            then
                echo "Error: multiple archives for $date, merge not done?"
                ls -l $date* 2>&1 | sed -e 's/^/ /'
                _set_sts 2
            else
                # and expect it to match post-daily base name format
                # YYYYMMDD, not YYYYMMDD.HH.MM or YYYYMMDD.HH.MM-seq#
                #
                if grep "^$date\$" $tmp.list >/dev/null
                then
                    :
                else
                    echo "Error: pmlogger_daily not run for $date?"
                    ls -l $date* 2>&1 | sed -e 's/^/ /'
                    # an error like the others, so record it as one
                    _set_sts 2
                fi
            fi
        else
            if [ -f $tmp.found ]
            then
                echo "Error: no archive(s) for $date"
                _set_sts 2
            fi
        fi
        back=`expr $back - 1`
    done
    cd ..
done
|