1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
|
(***********************************************************************)
(* *)
(* SpamOracle -- a Bayesian spam filter *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. This file is distributed under the terms of the *)
(* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* Word frequency database *)
exception Error of string
type short = {
s_num_good: int;
s_num_spam: int;
s_freq: (string, int * int) Hashtbl.t
}
type full = {
mutable f_num_good: int;
mutable f_num_spam: int;
f_high_freq: (string, int * int) Hashtbl.t;
f_low_freq: (string, int * int) Hashtbl.t
}
let magic = "Mailscrubber" (* + 4 digits for version number *)
let check_magic filename ic =
let mlen = String.length magic in
let buf = really_input_string ic (mlen + 4) in
if String.sub buf 0 mlen <> magic then
raise(Error(filename ^ ": bad magic number"));
try
int_of_string (String.sub buf mlen 4)
with Failure _ ->
raise(Error(filename ^ ": bad magic number"));
type db_chan = {zipped : bool ; ic : in_channel}
let open_db filename =
if Filename.check_suffix filename ".gz" then
{ ic = Unix.open_process_in ("gunzip -c " ^ filename);
zipped = true; }
else
{ ic = open_in_bin filename ;
zipped = false }
let close_db {zipped = zipped ; ic = ic } =
if zipped
then ignore(Unix.close_process_in ic)
else close_in ic
let current_version =
if Sys.ocaml_version < "4.03" then 1
else 2
let read_hashtbl filename ic version =
try
let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in
if version = current_version then tbl
else if version > current_version then
raise (Error(filename ^ ": database version not supported"))
else begin
Printf.eprintf "%s: converting from version %d to version %d\n\
Run 'spamoracle upgrade' to suppress this warning.\n%!"
filename version current_version;
let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in
Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl;
tbl'
end
with Failure _ ->
raise (Error(filename ^ ": database is corrupted"))
let read_short filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let freq = read_hashtbl filename ic version in
close_db db_ic;
{ s_num_good = ng; s_num_spam = ns; s_freq = freq }
let read_full filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let high_freq = read_hashtbl filename ic version in
let low_freq = read_hashtbl filename ic version in
close_db db_ic;
{ f_num_good = ng; f_num_spam = ns;
f_low_freq = low_freq; f_high_freq = high_freq }
let temp_file basename =
let pid = Unix.getpid() in
let rec tmpfile counter =
if counter > 10000 then raise (Error "cannot create temporary database");
let filename = basename ^ string_of_int (pid + counter) in
try
(filename,
open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600
filename)
with Sys_error _ ->
tmpfile (counter + 1)
in tmpfile 0
let write_full filename db =
let basename, zip =
if Filename.check_suffix filename ".gz" then
Filename.chop_suffix filename ".gz", true
else
filename, false in
let (tempname, oc) = temp_file (basename ^ ".tmp") in
Printf.fprintf oc "%s%04d" magic current_version;
output_binary_int oc db.f_num_good;
output_binary_int oc db.f_num_spam;
Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing];
close_out oc;
if zip then begin
let r = Sys.command ("gzip -best " ^ tempname) in
if r = 0 then
Sys.rename (tempname ^ ".gz") filename
else
Sys.rename tempname basename
end else
Sys.rename tempname filename
let create sz =
{ f_num_good = 0;
f_num_spam = 0;
f_high_freq = Hashtbl.create sz;
f_low_freq = Hashtbl.create sz }
let add_good db w =
begin try
let (g, s as f) = Hashtbl.find db.f_high_freq w in
Hashtbl.replace db.f_high_freq w (g+1, s)
with Not_found ->
try
let (g, s as f) = Hashtbl.find db.f_low_freq w in
let g' = g + 1 in
if 2 * g' + s >= 5 then begin
Hashtbl.remove db.f_low_freq w;
Hashtbl.add db.f_high_freq w (g', s)
end else
Hashtbl.replace db.f_low_freq w (g', s)
with Not_found ->
Hashtbl.add db.f_low_freq w (1, 0)
end
let add_spam db w =
begin try
let (g, s) = Hashtbl.find db.f_high_freq w in
Hashtbl.replace db.f_high_freq w (g, s+1)
with Not_found ->
try
let (g, s) = Hashtbl.find db.f_low_freq w in
let s' = s + 1 in
if 2 * g + s' >= 5 then begin
Hashtbl.remove db.f_low_freq w;
Hashtbl.add db.f_high_freq w (g, s')
end else
Hashtbl.replace db.f_low_freq w (g, s')
with Not_found ->
Hashtbl.add db.f_low_freq w (0, 1)
end
open Printf
let dump db oc =
let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in
fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam;
Hashtbl.iter dump_entry db.f_high_freq;
Hashtbl.iter dump_entry db.f_low_freq
let split s =
try
let i = String.index s ' ' in
let j = String.index_from s (i + 1) ' ' in
(String.sub s 0 i,
int_of_string (String.sub s (i + 1) (j - i - 1)),
int_of_string (String.sub s (j + 1) (String.length s - j - 1)))
with Not_found ->
raise(Error("Database restoration: ill-formed line `"
^ String.escaped s ^ "'"))
let restore ic =
let db = create 997 in
begin try
let (w, ng, ns) = split (input_line ic) in
if w <> "SPAMORACLE/1"
then raise (Error("Database restoration: wrong version"));
db.f_num_good <- ng;
db.f_num_spam <- ns
with End_of_file ->
raise (Error("Database restoration: first line missing"));
end;
begin try
while true do
let (w, g, s) = split (input_line ic) in
if 2 * g + s >= 5
then Hashtbl.add db.f_high_freq w (g, s)
else Hashtbl.add db.f_low_freq w (g, s)
done
with End_of_file ->
()
end;
db
let in_short db w = Hashtbl.mem db.s_freq w
let in_full db w = Hashtbl.mem db.f_high_freq w
|