File: database.ml

package info (click to toggle)
spamoracle 1.4-12
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 256 kB
  • ctags: 245
  • sloc: ml: 1,198; makefile: 145; sh: 74
file content (196 lines) | stat: -rw-r--r-- 6,045 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id: database.ml,v 1.8 2003/05/30 11:31:38 xleroy Exp $ *)

(* Word frequency database *)

exception Error of string

type short = {
  s_num_good: int;
  s_num_spam: int;
  s_freq: (string, int * int) Hashtbl.t
}

type full = {
  mutable f_num_good: int;
  mutable f_num_spam: int;
  f_high_freq: (string, int * int) Hashtbl.t;
  f_low_freq: (string, int * int) Hashtbl.t
}

let magic = "Mailscrubber0001"

let check_magic filename ic =
  let buf = String.create (String.length magic) in
  really_input ic buf 0 (String.length magic);
  if buf <> magic then raise(Error(filename ^ ": bad magic number"))

type db_chan = {zipped : bool ; ic : in_channel}

let open_db filename =
  if Filename.check_suffix filename ".gz" then
    { ic = Unix.open_process_in ("gunzip -c " ^ filename);
      zipped = true; }
  else
    { ic = open_in_bin filename ;
      zipped = false }

let close_db {zipped = zipped ; ic = ic } =
  if zipped 
  then ignore(Unix.close_process_in ic)
  else close_in ic

let marshal_from_channel filename ic =
  try
    Marshal.from_channel ic
  with Failure _ ->
    raise (Error(filename ^ ": database is corrupted"))

let read_short filename =
  let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
  check_magic filename ic;
  let ng = input_binary_int ic in
  let ns = input_binary_int ic in
  let freq = marshal_from_channel filename ic in
  close_db db_ic;
  { s_num_good = ng; s_num_spam = ns; s_freq = freq }

let read_full filename =
  let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
  check_magic filename ic;
  let ng = input_binary_int ic in
  let ns = input_binary_int ic in
  let high_freq = marshal_from_channel filename ic in
  let low_freq = marshal_from_channel filename ic in
  close_db db_ic;
  { f_num_good = ng; f_num_spam = ns; 
    f_low_freq = low_freq; f_high_freq = high_freq }

let temp_file basename =
  let pid = Unix.getpid() in
  let rec tmpfile counter =
    if counter > 10000 then raise (Error "cannot create temporary database");
    let filename = basename ^ string_of_int (pid + counter) in
    try
      (filename,
       open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600
                    filename)
    with Sys_error _ ->
      tmpfile (counter + 1)
  in tmpfile 0

let write_full filename db =
  let basename, zip =
    if Filename.check_suffix filename ".gz" then
      Filename.chop_suffix filename ".gz", true
    else
      filename, false in
  let (tempname, oc) = temp_file (basename ^ ".tmp") in
  output_string oc magic;
  output_binary_int oc db.f_num_good;
  output_binary_int oc db.f_num_spam;
  Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
  Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing];
  close_out oc;
  if zip then begin
    let r = Sys.command ("gzip -best " ^ tempname) in
    if r = 0 then
      Sys.rename (tempname ^ ".gz") filename
    else
      Sys.rename tempname basename
  end else
    Sys.rename tempname filename 


let create sz =
  { f_num_good = 0;
    f_num_spam = 0;
    f_high_freq = Hashtbl.create sz;
    f_low_freq = Hashtbl.create sz }

let add_good db w =
  begin try
    let (g, s as f) = Hashtbl.find db.f_high_freq w in
    Hashtbl.replace db.f_high_freq w (g+1, s)
  with Not_found ->
  try
    let (g, s as f) = Hashtbl.find db.f_low_freq w in
    let g' = g + 1 in
    if 2 * g' + s >= 5 then begin
      Hashtbl.remove db.f_low_freq w;
      Hashtbl.add db.f_high_freq w (g', s)
    end else
      Hashtbl.replace db.f_low_freq w (g', s)
  with Not_found ->
    Hashtbl.add db.f_low_freq w (1, 0)
  end

let add_spam db w =
  begin try
    let (g, s) = Hashtbl.find db.f_high_freq w in
    Hashtbl.replace db.f_high_freq w (g, s+1)
  with Not_found ->
  try
    let (g, s) = Hashtbl.find db.f_low_freq w in
    let s' = s + 1 in
    if 2 * g + s' >= 5 then begin
      Hashtbl.remove db.f_low_freq w;
      Hashtbl.add db.f_high_freq w (g, s')
    end else
      Hashtbl.replace db.f_low_freq w (g, s')
  with Not_found ->
    Hashtbl.add db.f_low_freq w (0, 1)
  end

open Printf

let dump db oc =
  let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in
  fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam;
  Hashtbl.iter dump_entry db.f_high_freq;
  Hashtbl.iter dump_entry db.f_low_freq

let split s =
  try
    let i = String.index s ' ' in
    let j = String.index_from s (i + 1) ' ' in
    (String.sub s 0 i,
     int_of_string (String.sub s (i + 1) (j - i - 1)),
     int_of_string (String.sub s (j + 1) (String.length s - j - 1)))
  with Not_found ->
    raise(Error("Database restoration: ill-formed line `"
                ^ String.escaped s ^ "'"))

let restore ic =
  let db = create 997 in
  begin try
    let (w, ng, ns) = split (input_line ic) in
    if w <> "SPAMORACLE/1"
    then raise (Error("Database restoration: wrong version"));
    db.f_num_good <- ng;  
    db.f_num_spam <- ns
  with End_of_file ->
    raise (Error("Database restoration: first line missing"));
  end;
  begin try
    while true do
      let (w, g, s) = split (input_line ic) in
      if 2 * g + s >= 5
      then Hashtbl.add db.f_high_freq w (g, s)
      else Hashtbl.add db.f_low_freq w (g, s)
    done
  with End_of_file ->
    ()
  end;
  db