File: break

package info (click to toggle)
magyarispell 1.2%2Brepack-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 4,952 kB
  • sloc: sh: 1,032; awk: 404; makefile: 265; sed: 153; lisp: 102
file content (36 lines) | stat: -rwxr-xr-x 735 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/sh
# break: split files into words, v1.2 (c) László Németh, BSD License
# 1.2: fix space, HTML, etc. problems.
# 1.1: fix tabulator, ISO-8859-2, etc. problems
# break_lines -- turn the text on standard input into a word list.
#
# A word is a run of bytes made of the hyphen, the ASCII letters and the
# high-bit codes listed below (chosen for ISO-8859-2 text). Every other
# byte, including blanks and line ends, acts as a separator and is turned
# into a newline, so each word lands on a line of its own. The lines are
# then sorted and adjacent duplicates are dropped.
#
# Consecutive separators produce empty lines; after sort and uniq these
# remain as one empty line in the output.
#
# Input:  text on stdin (no arguments).
# Output: one word per line on stdout, sorted, without repeats.
break_lines() {
	tr -c '\-a-zA-Z\241-\254\256-\326\330-\366\370-\377' '\n' |
		sort |
		uniq
}

# Called without any argument: print a short description and the usage
# line on standard output, then stop with exit status 0.
if [ "$#" -eq 0 ]; then
	echo "break -- collect words from text and html files
Usage: break [-html] file(s)"
	exit 0
fi

# Pick the processing pipeline from the first argument.
#   -html   the remaining arguments are HTML files: the text between
#           head, script, style, code and pre tags is discarded and all
#           other tags are stripped before the words are collected
#   other   all arguments are plain-text files
# In both modes the concatenated input ends up in break_lines, which
# prints the sorted list of distinct words.
#
# "$@" is quoted so that every file name reaches cat as exactly one
# argument, even when it contains blanks or glob characters. When no
# file name is left after -html, cat reads standard input.
case "$1" in
 -html)
 shift # drop the option itself; "$@" now holds only the file names
 cat "$@" |
 tr '\n' ' ' | # make 1 line from files
 sed 's/</\
 </g' | # break at HTML tags (replacement is: newline, one blank, "<")
 sed '/<head/I,/\/head/Id' | # range delete; I = ignore case (GNU sed)
 sed '/<script/I,/\/script/Id' |
 sed '/<style/I,/\/style/Id' |
 sed '/<code/I,/\/code/Id' |
 sed '/<pre/I,/\/pre/Id' |
 sed 's/<[^>]*>//g' | break_lines;; # strip whatever tags are left

 *)
 cat "$@" | break_lines;;

esac