#!/bin/sh
# break -- split files into a sorted list of unique words, v1.2
# (c) László Németh, BSD License
# 1.2: fix space, HTML, etc. problems.
# 1.1: fix tab, ISO-8859-2, etc. problems.
# break_lines -- filter: read text on stdin and write its distinct words to
# stdout, one per line, in sort(1) order.
#
# A word is a maximal run of '-', ASCII letters and the high bytes
# \241-\254, \256-\326, \330-\366, \370-\377 (the ISO-8859-2 ranges chosen
# by the original author).  Every other byte is a separator.
# Takes no arguments.
break_lines()
{
  # -c:  translate everything that is NOT a word byte into a newline.
  # -s:  squeeze each run of those newlines into one, so consecutive
  #      separators (", ", blank lines, ...) do not produce empty "words".
  # LC_ALL=C keeps tr and sed byte-oriented whatever the caller's locale is;
  # the octal ranges above are meant as raw bytes, not characters.
  LC_ALL=C tr -cs '\-a-zA-Z\241-\254\256-\326\330-\366\370-\377' '\n' |
    # Input that starts with a separator still yields one leading empty
    # line; drop it.
    LC_ALL=C sed '/^$/d' |
    sort |
    uniq
}
# Command-line driver.
#
#   break [-html] file(s)
#
# With no arguments: print the usage text and exit 0.
# With -html as the first argument: strip HTML markup from the remaining
# files (dropping the contents of head, script, style, code and pre
# elements) and feed what is left to break_lines.
# Otherwise: feed the files to break_lines unchanged.
case $# in
  0)
    printf '%s\n' \
      'break -- collect words from text and html files' \
      'Usage: break [-html] file(s)'
    exit 0
    ;;
esac

case "$1" in
  -html)
    shift
    # "$@" is quoted so file names containing blanks or glob characters
    # reach cat as single arguments.
    cat -- "$@" |
      tr '\n' ' ' | # make 1 line from files
      # Put every tag on a line of its own: break before each '<' and after
      # each '>'.  Breaking after '>' keeps the text that follows a closing
      # tag (e.g. "</code> more text") out of the ranges deleted below.
      # The continuation lines must start in column 0: any indentation
      # would become part of the sed replacement text.
      sed -e 's/</\
</g' -e 's/>/>\
/g' |
      # Delete each listed element from its opening tag through its closing
      # tag.  The [[:space:]>] after the tag name stops "<head" from also
      # matching "<header>", "<pre" from matching longer names, and so on.
      # The I address modifier (case-insensitive match) is a sed extension,
      # as in the original script.  Whatever tags remain are then removed.
      sed -e '/<head[[:space:]>]/I,/<\/head[[:space:]>]/Id' \
          -e '/<script[[:space:]>]/I,/<\/script[[:space:]>]/Id' \
          -e '/<style[[:space:]>]/I,/<\/style[[:space:]>]/Id' \
          -e '/<code[[:space:]>]/I,/<\/code[[:space:]>]/Id' \
          -e '/<pre[[:space:]>]/I,/<\/pre[[:space:]>]/Id' \
          -e 's/<[^>]*>//g' |
      break_lines
    ;;
  *)
    cat -- "$@" | break_lines
    ;;
esac