1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
|
#!/bin/sh
#
# Audit the "dictionary" of PCP "words" used by man-spell
#
# Copyright (c) 2024 Ken McDonell, Inc. All Rights Reserved.
#
usage="Usage: audit-man-dict [-d] [-i seed-file] [-[sS] save-file] [-D save-dict] [man-src-file ...]"

# Byte-order collation for everything run from here on -- presumably so
# that the sort(1) output and the comm(1) comparisons later in the
# script agree on ordering.
export LC_COLLATE=POSIX

# Both ispell and man-spell must be on $PATH.  command -v is the POSIX
# way to probe for a command (which(1) is not standardized), and the
# complaints are diagnostics so they go to stderr.
if ! command -v ispell >/dev/null 2>&1
then
    echo >&2 "Arrgh: ispell not installed, no dice"
    exit 1
fi

if ! command -v man-spell >/dev/null 2>&1
then
    echo >&2 "Arrgh: man-spell not on $PATH, no dice"
    exit 1
fi

# Scratch files are all named $tmp.<suffix>.  They live in a private
# directory made by mktemp(1) rather than under a predictable
# /tmp/...-$$ name that someone else could pre-create or symlink.
tmpdir=$(mktemp -d /tmp/audit-man-spell.XXXXXX) || exit 1
tmp=$tmpdir/audit

status=0	# success is the default

# On exit, or on HUP/INT/QUIT/TERM, remove the scratch directory and
# exit with whatever $status holds at that time.
trap 'rm -rf -- "$tmpdir"; exit $status' 0 1 2 3 15
# file(1) does not work, so need some heuristics
# ... assume existence of $1 already established
#
_istroff()
{
    # Guess whether file $1 is troff man page source.
    # Returns 0 if it looks like troff -man input, else 1.
    #
    # The first line alone is enough to reject the known impostors.
    case "$(head -n 1 "$1")"
    in
        '#!'*)
            # shell or other script
            return 1
            ;;
        *'by Pod::Man'*)
            # generated by Pod::Man
            return 1
            ;;
    esac

    # Otherwise a .SH request (from the -man macros) at the start of
    # any line is what identifies the file as troff.
    if grep -q '^\.SH ' <"$1"
    then
        return 0
    fi

    return 1
}
# Command line state, filled in by the option parsing below:
#   seed       -i file of "words" to start from ('' if none)
#   save       -s/-S file that receives the final "words" ('' if none)
#   debug      bumped by one for every -d
#   save_dict  -D file that receives the dictionary strings ('' if none)
#
seed=''
save=''
debug=0
save_dict=''

while getopts "dD:i:s:S:?" c
do
    case "$c"
    in
        d)  debug=$((debug + 1))
            ;;

        D)  # save dictionary strings
            save_dict="$OPTARG"
            ;;

        i)  # seed "words" to start with, probably the result of a
            # prior run and saved with -s
            #
            if [ ! -f "$OPTARG" ]
            then
                echo >&2 "Error: $OPTARG not found for -i"
                status=1
                exit
            fi
            seed="$OPTARG"
            ;;

        s|S)
            # save "words" == -i file if any, + new ones from
            # this run, all sorted and duplicates removed
            #   -s save-file is precious
            #   -S clobber an existing save-file
            #
            if [ "$c" = s ] && [ -f "$OPTARG" ]
            then
                echo >&2 "Error: $OPTARG already exists for -s"
                status=1
                exit
            fi
            save="$OPTARG"
            ;;

        \?) # getopts reports an unknown option or a missing option
            # argument (and an explicit -?) as a literal "?".  Set
            # status so the exit trap does not report success after a
            # usage error.
            echo >&2 "$usage"
            status=1
            exit
            ;;
    esac
done

# discard the options, leaving only the man-src-file arguments (if any)
shift $((OPTIND - 1))
# Where are we?  The script really only works sanely when run from the
# top of the man pages, i.e. from a directory named "man".  Anywhere
# else earns a warning on stderr, but we carry on regardless.
#
case "$(pwd)"
in
    */man)
        ;;
    *)
        echo >&2 "Warning: pwd: $(pwd) is not the top of the man dirs"
        ;;
esac
# Pass 1
# - add "seed" words if any to "words" file
# - for each input file
#   + break into one "word" per line using a crude hack:
#     strip troff in-line font changes, then every character that is
#     not alphabetic and not _ and not ' => \n, then strip leading
#     or trailing ' (leaving behind possessive forms like host's)
#   + append to "words" file
# - sort | uniq "words" file
# - if -s or -S, save "words" file
#
if [ -n "$seed" ]
then
    cp "$seed" "$tmp.words"
else
    : >"$tmp.words"
fi

# The list of candidate files, one per line, is either the command line
# arguments or (by default) everything below the current directory that
# has a man-section-like suffix.
if [ $# -gt 0 ]
then
    # printf and the quoting keep names containing blanks or glob
    # characters intact
    for arg
    do
        printf '%s\n' "$arg"
    done
else
    find * -name '*.[1-9]*' \( -type f -o -type l \)
fi \
| while IFS= read -r file	# IFS= and -r: take each name verbatim
do
    if [ ! -f "$file" ]
    then
        echo >&2 "$file: not found"
        continue
    fi
    if _istroff "$file"
    then
        # smells like troff man source ...
        #
        [ "$debug" -gt 0 ] && echo "$file:"
        # first sed:  drop \fX (X an upper case letter) and \f(XX
        # tr:         anything that cannot be part of a "word" => newline
        # second sed: drop leading and trailing ' characters
        sed <"$file" \
            -e 's/\\f[A-Z]//g' \
            -e 's/\\f(..//g' \
        | tr -c "A-Za-z_'" '\012' \
        | sed >>"$tmp.words" \
            -e "s/^''*//" \
            -e "s/''*\$//"
    else
        [ "$debug" -gt 0 ] && echo >&2 "$file: skipped (not troff)"
    fi
done

# $tmp.words was appended to by the loop above (it is a file, so it
# does not matter that the loop ran in a pipeline); now make it sorted
# and free of duplicates
sort <"$tmp.words" \
| uniq >"$tmp.tmp"
mv "$tmp.tmp" "$tmp.words"

[ -n "$save" ] && cp "$tmp.words" "$save"
# Pass 2
# - get the "dictionary" of PCP words from man-spell for an empty input
#   file
#
# The input file ($tmp.tmp, a single empty line) and the result of the
# pipeline ($tmp.dict) are deliberately different files: redirecting
# the output onto the file man-spell has been asked to read would
# truncate it at some unpredictable point while man-spell is running.
#
# The first sed deletes lines containing "Common words" or
# " skipped (not troff)" and stops at the first empty line; what is
# left is split into one blank-separated token per line, then sorted
# with duplicates removed.
#
echo >"$tmp.tmp"
man-spell -d "$tmp.tmp" 2>&1 \
| sed \
    -e '/Common words/d' \
    -e '/ skipped (not troff)/d' \
    -e '/^$/q' \
| tr ' ' '\012' \
| sed -e '/^$/d' \
| sort \
| uniq >"$tmp.dict"

# break dictionary into strings and patterns
#
# Entries containing [ or ? are patterns and go to $tmp.patterns,
# everything else is a plain string and goes to $tmp.strings.
# $tmp.patterns is created up front so that it exists (empty) even when
# the dictionary contains no patterns at all, and its name is handed to
# awk with -v rather than being spliced into the program text.
#
: >"$tmp.patterns"
awk <"$tmp.dict" >"$tmp.strings" -v patterns="$tmp.patterns" '
/\[/ || /\?/    { print >patterns; next }
                { print }'

[ -n "$save_dict" ] && cp "$tmp.strings" "$save_dict"
# Any dictionary string containing a character other than a letter,
# _ or ' is suspect, so list those (grep succeeds only if it selected
# at least one such string).
if grep "[^A-Za-z_']" "$tmp.strings" >"$tmp.tmp"
then
    echo "Warning: dodgey? 'cause ispell won't parse text into these words ..."
    cat "$tmp.tmp"
fi

# Size of each half of the dictionary; the tr strips the blank padding
# that some wc(1) implementations add to the count.
echo "$(wc -l <"$tmp.strings" | tr -d ' ') strings in the PCP dictionary"
echo "$(wc -l <"$tmp.patterns" | tr -d ' ') patterns in the PCP dictionary"

# Turn each pattern P into the sed command /^P$/p, so that the
# resulting script prints input lines that match a pattern in full.
sed -e 's;.*;/^&$/p;' <"$tmp.patterns" >"$tmp.sed"
#debug# cat $tmp.sed
# Inputs for the comparisons below:
#   $tmp.words   - all the "words" in the man page(s)
#   $tmp.strings - strings from PCP dictionary
#

# lines common to both files: dictionary strings that are used
comm -12 "$tmp.words" "$tmp.strings" >"$tmp.tmp"
echo "$(wc -l <"$tmp.tmp" | tr -d ' ') dictionary string matches"
#debug# cat $tmp.tmp

# lines only in $tmp.strings: dictionary strings that match no words
# in any man page (listed in full when -d is in effect)
#
comm -13 "$tmp.words" "$tmp.strings" >"$tmp.tmp"
echo "$(wc -l <"$tmp.tmp" | tr -d ' ') dictionary strings not in any man page"
if [ "$debug" -gt 0 ]
then
    cat "$tmp.tmp"
fi

# run the generated sed script over the words to pick out the ones
# that match a dictionary pattern
sed -E -n -f "$tmp.sed" <"$tmp.words" >"$tmp.tmp"
echo "$(wc -l <"$tmp.tmp" | tr -d ' ') dictionary pattern matches"
#debug# cat $tmp.tmp

# strings that match no patterns - TODO

exit
|