1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
(***********************************************************************)
(* *)
(* SpamOracle -- a Bayesian spam filter *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. This file is distributed under the terms of the *)
(* GNU Public License version 2. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* Decompose a string into words. *)
{
(* Maximal length of a word *)
let maxlen = 12
(* Map uppercase to lowercase. Remove ISO Latin 1 accents. *)
let tbl = "\
\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\
\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 \
!\"#$%&'()*+,-./\
0123456789:;<=>?\
@abcdefghijklmno\
pqrstuvwxyz[\\]^_\
`abcdefghijklmno\
pqrstuvwxyz{|}~\127\
\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\
\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159\
\160\
\
aaaaaaeceeeeiiii\
noooooouuuuyp\
aaaaaaeceeeeiiii\
noooooouuuuypy"
let normalize s =
String.map (fun c -> tbl.[Char.code c]) s
(* Check if a string is all uppercase characters *)
let all_uppercase s =
try
for i = 0 to String.length s - 1 do
let c = s.[i] in
if (c >= 'A' && c <= 'Z')
|| (c >= '\192' && c <= '\214')
|| (c >= '\216' && c <= '\222')
then ()
else raise Exit
done;
true
with Exit ->
false
(* Take first n chars of string *)
let adjust s n =
if String.length s <= n then s else String.sub s 0 n
(* Reassembly of stretched-out words *)
let reassembly_buffer = Bytes.create maxlen
let previous_words = ref ([] : string list)
let reassemble db action word =
let rec reass idx = function
| [] -> []
| hd :: tl ->
let l = String.length hd in
let idx = idx - l in
if idx < 0 then [] else begin
Bytes.blit_string hd 0 reassembly_buffer idx l;
let w = Bytes.sub_string reassembly_buffer idx (maxlen - idx) in
if db w then action w;
hd :: reass idx tl
end in
let l = String.length word in
if l >= maxlen then begin
previous_words := []
end else begin
let idx = maxlen - l in
Bytes.blit_string word 0 reassembly_buffer idx l;
previous_words := word :: reass idx !previous_words
end
}
let letter = [
'a'-'z' 'A'-'Z'
#ifdef FRENCH
'' '' '' '' '' '' '' '' '' '' '' '' ''
'' '' '' '' '' '' '' '' '' '' '' '' '' ''
#endif
#ifdef SPANISH
'' '' '' '' '' '' ''
'' '' '' '' '' '' ''
#endif
#ifdef ITALIAN
'' '' '' '' '' ''
'' '' '' '' '' ''
#endif
#ifdef GERMAN
'' '' ''
'' '' '' ''
#endif
#ifdef PORTUGUESE
'' '' '' '' '' '' '' '' '' '' '' '' ''
'' '' '' '' '' '' '' '' '' '' '' '' ''
#endif
]
let word_constituent = letter | '\''
let numeric = ['0'-'9' '.' ',' '$' '%' '\164' (* Euro *)]
let weird_character = ['\127' - '\255']
#ifdef JAPANESE
let jis_snd =
['\032'-'\127']
let kanji =
['\048'-'\116'] jis_snd
let katakana =
(['\037'] jis_snd | "\033\060")
let jis_char =
['\032'-'\116'] jis_snd
let jis_in =
"\027$" '(' ? ['@' 'B']
let jis_out =
"\027(" ['B' 'J' 'H']
#endif
rule split = parse
| word_constituent +
{ fun db action ->
let s = Lexing.lexeme lexbuf in
let l = String.length s in
if l >= 3 && all_uppercase s then action ("U" ^ string_of_int l);
let s = normalize s in
if !Config.reassemble_words then reassemble db action s;
if l >= 3 && l <= maxlen then action s;
split lexbuf db action }
| numeric numeric numeric numeric *
{ fun db action ->
let s = Lexing.lexeme lexbuf in
if String.length s <= 8 then action (normalize s);
split lexbuf db action }
#ifdef JAPANESE
| jis_in
{ split_jis lexbuf }
#endif
| weird_character weird_character weird_character weird_character *
{ fun db action ->
let s = Lexing.lexeme lexbuf in
action ("W" ^ string_of_int (String.length s));
split lexbuf db action }
| eof
{ fun db action -> () }
| _
{ split lexbuf }
#ifdef JAPANESE
and split_jis = parse
kanji kanji *
{ fun db action ->
let s = Lexing.lexeme lexbuf in
action ("\027$B" ^ adjust s 8 ^ "\027(B");
split_jis lexbuf db action }
| katakana katakana katakana katakana *
{ fun db action ->
let s = Lexing.lexeme lexbuf in
action ("\027$B" ^ adjust s 12 ^ "\027(B");
split_jis lexbuf db action }
| jis_char
{ split_jis lexbuf }
| eof
{ fun db action -> () }
| jis_out
{ split lexbuf }
| _
{ split lexbuf }
#endif
{
let iter action db txt =
previous_words := [];
split (Lexing.from_string txt) db action
}
|