File: audit-man-dict

package info (click to toggle)
pcp 6.3.8-1
  • links: PTS
  • area: main
  • in suites: sid, trixie
  • size: 235,180 kB
  • sloc: ansic: 1,253,622; sh: 173,998; xml: 160,490; cpp: 83,331; python: 20,482; perl: 18,302; yacc: 6,886; makefile: 2,955; lex: 2,862; fortran: 60; java: 52
file content (207 lines) | stat: -rwxr-xr-x 4,597 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/bin/sh
#
# Audit the "dictionary" of PCP "words" used by man-spell
#
# Copyright (c) 2024 Ken McDonell, Inc.  All Rights Reserved.
#

usage="Usage: audit-man-dict [-d] [-i seed-file] [-[sS] save-file] [-D save-dict] [man-src-file ...]"

# POSIX collation for every tool run from here on
export LC_COLLATE=POSIX

# Both ispell and man-spell must be on $PATH, otherwise give up now.
# command -v is the POSIX way to probe for a command (which(1) is an
# optional extra on some systems) and the complaint goes to stderr so
# it is not mistaken for normal output.
#
if ! command -v ispell >/dev/null 2>&1
then
    echo >&2 "Arrgh: ispell not installed, no dice"
    exit 1
fi

if ! command -v man-spell >/dev/null 2>&1
then
    echo >&2 "Arrgh: man-spell not on $PATH, no dice"
    exit 1
fi

# Scratch files all live in a private directory created by mktemp(1),
# rather than under a predictable /tmp/...-$$ name that someone else
# could create (or symlink) ahead of us.  $tmp is still a path prefix,
# so scratch files continue to be named $tmp.<suffix>
#
tmpdir=$(mktemp -d /tmp/audit-man-spell.XXXXXX) || exit 1
tmp=$tmpdir/tmp
status=0	# success is the default
# clean up on exit and on HUP, INT, QUIT or TERM ... $tmpdir is expanded
# now, $status is expanded when the trap fires
trap "rm -rf '$tmpdir'; exit \$status" 0 1 2 3 15

# Does $1 look like hand-written troff (-man) source?
#
# file(1) does not work, so need some heuristics
# ... assume existence of $1 already established
#
# Returns 0 if $1 contains a line starting with ".SH " and the first
# line of $1 neither starts with "#!" nor contains "by Pod::Man",
# else returns 1.
#
_istroff()
{
    # shell or other script?  generated by Pod::Man?
    # ... one look at the first line answers both questions, and
    # "head -n 1" is the POSIX spelling of the obsolescent "head -1"
    head -n 1 <"$1" | grep -q -e '^#!' -e 'by Pod::Man' && return 1
    # -man macros?
    grep -q '^\.SH ' <"$1" && return 0
    return 1
}

# Command line options
#   -d		bump the debug level, may be repeated
#   -D file	where to save the dictionary strings
#   -i file	seed "words" to start with, file must exist
#   -s file	where to save the "words", file must not exist
#   -S file	like -s, but an existing file is clobbered
# Anything else is a usage error, reported on stderr with a non-zero
# exit status.
#
seed=''
save=''
debug=0
save_dict=''
while getopts "dD:i:s:S:?" c
do
    case "$c"
    in
	d)	debug=$((debug + 1))
		;;
	D)	# save dictionary strings
		save_dict="$OPTARG"
		;;
	i)	# seed "words" to start with, probably the result of a
		# prior run and saved with -s
		#
		if [ ! -f "$OPTARG" ]
		then
		    echo >&2 "Error: $OPTARG not found for -i"
		    status=1
		    exit 1
		fi
		seed="$OPTARG"
		;;
	s|S)	# save "words" == -i file if any, + new ones from
		# this run, all sorted and duplicates removed
		# -s save-file is precious
		# -S clobber an existing save-file
		#
		if [ "$c" = s ] && [ -f "$OPTARG" ]
		then
		    echo >&2 "Error: $OPTARG already exists for -s"
		    status=1
		    exit 1
		fi
		save="$OPTARG"
		;;
	\?)	# unknown option or missing option argument
		echo >&2 "$usage"
		status=1
		exit 1
		;;
    esac
done
# discard the options, leaving only the man-src-file arguments (if any)
shift $((OPTIND - 1))

# Sanity check on where we are being run from: this is really only
# expected to work sanely from the top of the man pages, i.e. a
# current directory whose path ends in /man ... anywhere else earns a
# warning on stderr, but we carry on regardless.
#
here=`pwd`
if [ "${here%/man}" = "$here" ]
then
    echo >&2 "Warning: pwd: $here is not the top of the man dirs"
fi

# Pass 1
# - add "seed" words if any to "words" file
# - for each input file (the command line arguments if any, else
#   whatever find(1) turns up below the current directory)
#   + break into one "word" per line using a crude hack:
#     strip troff in-line font changes (\fX and \f(XX), then
#     anything not alphabetic and not _ and not ' => \n, then
#     strip leading or trailing ' (leaving behind any embedded ',
#     as in possessive forms)
#   + append to "words" file
# - sort -u "words" file
# - if -s or -S, save "words" file
#
if [ -n "$seed" ]
then
    cp "$seed" "$tmp.words"
else
    touch "$tmp.words"
fi
if [ $# -gt 0 ]
then
    # one name per line, passed through verbatim ... an unquoted echo
    # would word-split and glob-expand each name
    printf '%s\n' "$@"
else
    find * -name '*.[1-9]*' \( -type f -o -type l \)
fi \
| while IFS= read -r file	# keep blanks and backslashes in names intact
do
    if [ ! -f "$file" ]
    then
	echo "$file: not found"
	continue
    fi
    if _istroff "$file"
    then
	# smells like troff man source ...
	#
	[ "$debug" -gt 0 ] && echo "$file:"
	sed <"$file" \
	    -e 's/\\f[A-Z]//g' \
	    -e 's/\\f(..//g' \
	| tr -c "A-Za-z_'" '\012' \
	| sed >>"$tmp.words" \
	    -e "s/^''*//" \
	    -e "s/''*\$//"
    else
	[ "$debug" -gt 0 ] && echo >&2 "$file: skipped (not troff)"
    fi
done

sort -u <"$tmp.words" >"$tmp.tmp"
mv "$tmp.tmp" "$tmp.words"
if [ -n "$save" ]
then
    cp "$tmp.words" "$save"
fi

# Pass 2
# - get the "dictionary" of PCP words from man-spell for an empty input
#   file
# - from man-spell's output, drop the "Common words" and
#   "skipped (not troff)" lines, stop at the first empty line, then
#   split what is left into one entry per line, sorted with duplicates
#   removed, in $tmp.tmp
#
# man-spell's input file is deliberately NOT $tmp.tmp ... the last stage
# of the pipeline truncates $tmp.tmp as the pipeline is being started,
# possibly while man-spell is still reading its input.
#
echo >$tmp.empty
man-spell -d $tmp.empty 2>&1 \
| sed \
    -e '/Common words/d' \
    -e '/ skipped (not troff)/d' \
    -e '/^$/q' \
| tr ' ' '\012' \
| sed -e '/^$/d' \
| sort -u >$tmp.tmp

# break dictionary into strings and patterns
# - entries containing a [ or a ? are patterns => $tmp.patterns
# - everything else is a plain string => $tmp.strings
#
# $tmp.patterns is created up front so that it exists (and is empty)
# when the dictionary has no patterns at all, otherwise the wc and sed
# below fail on a missing file.  The file name is handed to awk with -v
# rather than being spliced into the awk program text.
#
: >$tmp.patterns
awk -v patterns="$tmp.patterns" <$tmp.tmp >$tmp.strings '
/\[/ || /\?/	{ print >patterns; next }
		{ print }'
[ -n "$save_dict" ] && cp $tmp.strings "$save_dict"

# any string with a character outside A-Za-z_' is suspect
#
grep "[^A-Za-z_']" $tmp.strings >$tmp.tmp
if [ -s $tmp.tmp ]
then
    echo "Warning: dodgey? 'cause ispell won't parse text into these words ..."
    cat $tmp.tmp
fi
echo "$(wc -l <$tmp.strings | sed -e 's/ //g') strings in the PCP dictionary"
echo "$(wc -l <$tmp.patterns | sed -e 's/ //g') patterns in the PCP dictionary"
# turn each pattern into a sed command that prints whole-line matches
sed <$tmp.patterns >$tmp.sed -e 's;.*;/^&$/p;'
#debug# cat $tmp.sed
#debug# cat $tmp.sed

# Final report, using
#	$tmp.words	- all the "words" in the man page(s)
#	$tmp.strings	- strings from PCP dictionary
#	$tmp.sed	- sed script built from the PCP dictionary patterns
#

# lines common to both files, i.e. dictionary strings that are used
#
comm -12 $tmp.words $tmp.strings >$tmp.tmp
count=$(wc -l <$tmp.tmp | sed -e 's/ //g')
echo "$count dictionary string matches"
#debug# cat $tmp.tmp

# lines only in $tmp.strings, i.e. dictionary strings that match no
# words in any man page
#
comm -13 $tmp.words $tmp.strings >$tmp.tmp
count=$(wc -l <$tmp.tmp | sed -e 's/ //g')
echo "$count dictionary strings not in any man page"
if [ "$debug" -gt 0 ]
then
    cat $tmp.tmp
fi

# words selected by the sed script
#
sed -E -n -f $tmp.sed <$tmp.words >$tmp.tmp
count=$(wc -l <$tmp.tmp | sed -e 's/ //g')
echo "$count dictionary pattern matches"
#debug# cat $tmp.tmp

# strings that match no patterns - TODO

exit