File: htmlscan.mll

package info (click to toggle)
spamoracle 1.6-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 296 kB
  • sloc: ml: 1,380; makefile: 135
file content (218 lines) | stat: -rw-r--r-- 7,568 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(** Approximate HTML scanner.  Extracts words from HTML text,
    as well as certain parameters of certain tags (e.g. URLs). *)

{
(* Header section: OCaml helpers available to the semantic actions below. *)
(* Sets and maps keyed by strings, used for the tag set and entity table. *)
module StringSet = Set.Make(String)
module StringMap = Map.Make(String)

(* Matches one %XX escape in a URL, capturing its two hex digits. *)
let re_url_encoding = Str.regexp "%\\([0-9A-Fa-f][0-9A-Fa-f]\\)"

(* [decode_url_percent s] is the substitution callback for
   {!re_url_encoding}: it converts the escape just matched in [s]
   into the one-character string it denotes. *)
let decode_url_percent s =
  let hex = Str.matched_group 1 s in
  let code = int_of_string ("0x" ^ hex) in
  String.make 1 (Char.chr code)

(* [decode_url s] returns [s] with every %XX escape replaced by the
   character it encodes; other characters are left untouched. *)
let decode_url s =
  Str.global_substitute re_url_encoding decode_url_percent s

(* Maps HTML entity names (without '&' and ';') to the ISO Latin-1
   character they denote. *)
let entity_table =
  let add table (name, chr) = StringMap.add name chr table in
  List.fold_left add StringMap.empty
    ["amp", '&';          "lt", '<';           "gt", '>';
     "nbsp", '\160';
     "Agrave", '\192';    "Aacute", '\193';    "Acirc", '\194';
     "Atilde", '\195';    "Auml", '\196';      "Aring", '\197';
     "AElig", '\198';     "Ccedil", '\199';    "Egrave", '\200';
     "Eacute", '\201';    "Ecirc", '\202';     "Euml", '\203';
     "Igrave", '\204';    "Iacute", '\205';    "Icirc", '\206';
     "Iuml", '\207';      "ETH", '\208';       "Ntilde", '\209';
     "Ograve", '\210';    "Oacute", '\211';    "Ocirc", '\212';
     "Otilde", '\213';    "Ouml", '\214';      "times", '\215';
     "Oslash", '\216';    "Ugrave", '\217';    "Uacute", '\218';
     "Ucirc", '\219';     "Uuml", '\220';      "Yacute", '\221';
     "THORN", '\222';     "szlig", '\223';     "agrave", '\224';
     "aacute", '\225';    "acirc", '\226';     "atilde", '\227';
     "auml", '\228';      "aring", '\229';     "aelig", '\230';
     "ccedil", '\231';    "egrave", '\232';    "eacute", '\233';
     "ecirc", '\234';     "euml", '\235';      "igrave", '\236';
     "iacute", '\237';    "icirc", '\238';     "iuml", '\239';
     "eth", '\240';       "ntilde", '\241';    "ograve", '\242';
     "oacute", '\243';    "ocirc", '\244';     "otilde", '\245';
     "ouml", '\246';      "divide", '\247';    "oslash", '\248';
     "ugrave", '\249';    "uacute", '\250';    "ucirc", '\251';
     "uuml", '\252';      "yacute", '\253';    "thorn", '\254';
     "yuml", '\255']

(* HTML tags (lowercased) that act as word separators: a space is
   emitted in the extracted text whenever one of them is seen. *)
let word_breaking_tags =
  List.fold_left
    (fun set t -> StringSet.add t set)
    StringSet.empty
    [ "p"; "br"; "ul"; "ol"; "dt"; "li"; "dd"; "table"; "tr"; "th"; "td";
      "img"; "div"; "blockquote"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
      "address" ]

(* Accumulator for the scanner's result: [txt] collects the character
   data of the document, [extra] collects selected tag names and
   attribute values (e.g. URLs), appended after the text on separate
   lines when [contents] is called. *)
module Output = struct

  type t = { txt: Buffer.t; extra: Buffer.t }

  let create () =
    { txt = Buffer.create 2048; extra = Buffer.create 256 }

  (* Reset both buffers for a new scan. *)
  let clear { txt; extra } =
    Buffer.clear txt;
    Buffer.clear extra

  (* Final result: the text, a newline, then the extra lines.
     Note this mutates [txt] in place. *)
  let contents { txt; extra } =
    Buffer.add_char txt '\n';
    Buffer.add_buffer txt extra;
    Buffer.contents txt

  let char ob c = Buffer.add_char ob.txt c

  let string ob s = Buffer.add_string ob.txt s

  (* Record [s] in the auxiliary buffer, one entry per line. *)
  let add_extra ob s =
    Buffer.add_string ob.extra s;
    Buffer.add_char ob.extra '\n'

  (* Handle a tag named [t]: emit a word break if it is a word-breaking
     tag, and record its name when the configuration asks for it. *)
  let tag ob t =
    if StringSet.mem t word_breaking_tags then char ob ' ';
    if !Config.html_add_tags then add_extra ob t

  (* Handle attribute [n] with value [s] of tag [t]: record the value
     when "tag/attr" matches the configured pattern, URL-decoding it
     for href and src attributes. *)
  let tag_attr ob t n s =
    let n = String.lowercase_ascii n in
    if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then
      add_extra ob (if n = "href" || n = "src" then decode_url s else s)
end

(* Global mutable state shared by the lexer rules below; the scanner
   is therefore not reentrant. *)
let ob = Output.create()            (* output accumulator for the current scan *)
let tag = ref ""                    (* lowercased name of the tag being scanned *)
let attr_name = ref ""              (* name of the attribute being scanned *)
let attr_value = Buffer.create 128  (* accumulates a quoted attribute value *)

}

(* HTML whitespace. *)
let ws = [' ' '\n' '\r' '\t']
(* Tag, attribute and entity names. *)
let name = ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '0'-'9' '.' '-']*
(* Attribute value written without quotes: runs until whitespace or '>'. *)
let unquotedattrib = 
  [^ '\'' '\"' '>' ' ' '\n' '\r' '\t'] [^ '>' ' ' '\n' '\r' '\t']*

(* Main scanning rule: copies character data to [ob], decodes entity
   references and two-byte UTF-8 sequences, and hands tags over to
   [tagbody].  Returns the accumulated output at end of input. *)
rule main = parse
    "<!"                        (* tolerance *)
      { comment lexbuf; main lexbuf }
  | "<" name
      { (* opening tag: remember its lowercased name for [tagbody] *)
        let s = Lexing.lexeme lexbuf in
        tag := String.lowercase_ascii (String.sub s 1 (String.length s - 1));
        tagbody lexbuf;
        main lexbuf }
  | "</" name
      { (* closing tag: same treatment, skipping the two-char "</" *)
        let s = Lexing.lexeme lexbuf in
        tag := String.lowercase_ascii (String.sub s 2 (String.length s - 2));
        tagbody lexbuf;
        main lexbuf }
  | "<"                         (* tolerance *)
      { Output.char ob '<'; main lexbuf }
  | "&"
      { Output.char ob (entity lexbuf); main lexbuf }
  (* This parses UTF-8 encodings of ISO Latin 1 characters in the
     0x80 - 0xFF range *)
  | ['\194'-'\195'] ['\128'-'\191']
      { let a = Char.code (Lexing.lexeme_char lexbuf 0)
        and b = Char.code (Lexing.lexeme_char lexbuf 1) in
        (* For lead bytes 0xC2-0xC3, [a land 0x3] equals [a land 0x1F],
           so this is the standard two-byte UTF-8 decoding. *)
        Output.char ob (Char.chr (((a land 0x3) lsl 6) lor (b land 0x3F)));
        main lexbuf }
  | _
      { Output.char ob (Lexing.lexeme_char lexbuf 0); main lexbuf }
  | eof
      { Output.contents ob }

(* Skips a "<!...>" declaration or comment up to the first '>'.
   NOTE(review): does not honor the "-->" terminator, so a '>' inside
   an HTML comment ends the skip early — deliberate tolerance here. *)
and comment = parse
    ">"                         (* tolerance *)
      { () }
  | _                           (* tolerance *)
      { comment lexbuf }
  | eof                         (* tolerance *)
      { () }

(* Scans the inside of a tag (after its name) up to the closing '>';
   each attribute name encountered is handled by [tagattrib].  The tag
   itself is reported to the output via [Output.tag]. *)
and tagbody = parse
    ">"
      { Output.tag ob !tag }
  | name
      { attr_name := Lexing.lexeme lexbuf;
        tagattrib lexbuf;
        tagbody lexbuf }
  | _                           (* tolerance -- should be ws *)
      { tagbody lexbuf }
  | eof                         (* tolerance *)
      { Output.tag ob !tag }

(* After an attribute name: an '=' (with optional surrounding spaces)
   introduces a value; otherwise the attribute is valueless and is
   reported with the empty string. *)
and tagattrib = parse
    ws* '=' ws* 
      { tagvalue lexbuf }
  | ""
      { Output.tag_attr ob !tag !attr_name "" }

(* Parses an attribute value: single-quoted, double-quoted, or bare.
   A missing value is tolerated and reported as the empty string. *)
and tagvalue = parse
    "'"
      { Buffer.clear attr_value; singlequoted lexbuf }
  | "\""
      { Buffer.clear attr_value; doublequoted lexbuf }
  | unquotedattrib
      { Output.tag_attr ob !tag !attr_name (Lexing.lexeme lexbuf) }
  | ""                          (* tolerance *)
      { Output.tag_attr ob !tag !attr_name "" }

(* Accumulates a '...'-quoted attribute value in [attr_value],
   decoding entity references along the way, then reports it. *)
and singlequoted = parse
    "'" | eof                   (* eof is tolerance *)
      { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) }
  | "&"
      { Buffer.add_char attr_value (entity lexbuf); singlequoted lexbuf }
  | _
      { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0);
        singlequoted lexbuf }

(* Same as [singlequoted], for a "..."-quoted attribute value. *)
and doublequoted = parse
    "\"" | eof                 (* eof is tolerance *)
      { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) }
  | "&"
      { Buffer.add_char attr_value (entity lexbuf); doublequoted lexbuf }
  | _
      { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0);
        doublequoted lexbuf }

(* Decodes an entity reference after the '&'.  Numeric references
   outside 0-255 and unknown names yield '\255'; a lone '&' stands
   for itself.  The optional trailing ';' is consumed by [entity_end]. *)
and entity = parse
    '#' ['0'-'9']+
      { let s = Lexing.lexeme lexbuf in
        (* NOTE(review): int_of_string raises Failure on an absurdly
           long digit run — confirm callers tolerate this. *)
        let n = int_of_string (String.sub s 1 (String.length s - 1)) in
        entity_end lexbuf;
        if n >= 0 && n <= 255 then Char.chr n else '\255' }
  | name
      { let s = Lexing.lexeme lexbuf in
        entity_end lexbuf;
        try StringMap.find s entity_table with Not_found -> '\255' }
  | ""                          (* tolerance *)
      { '&' }

(* Consumes the optional ';' terminating an entity reference. *)
and entity_end = parse
    ";" ?
      { () }

{
(* [extract_text s] scans the HTML text [s] and returns the extracted
   character data followed by any retained tag names and attribute
   values.  Resets the shared output buffer first, so the function is
   not reentrant. *)
let extract_text s =
  Output.clear ob;
  s |> Lexing.from_string |> main
}