File: database.ml

package info (click to toggle)
spamoracle 1.6-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 296 kB
  • sloc: ml: 1,380; makefile: 135
file content (219 lines) | stat: -rw-r--r-- 6,920 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* Word frequency database *)

exception Error of string

type short = {
  s_num_good: int;
  s_num_spam: int;
  s_freq: (string, int * int) Hashtbl.t
}

type full = {
  mutable f_num_good: int;
  mutable f_num_spam: int;
  f_high_freq: (string, int * int) Hashtbl.t;
  f_low_freq: (string, int * int) Hashtbl.t
}

let magic = "Mailscrubber"  (* + 4 digits for version number *)

let check_magic filename ic =
  let mlen = String.length magic in
  let buf = really_input_string ic (mlen + 4) in
  if String.sub buf 0 mlen <> magic then
    raise(Error(filename ^ ": bad magic number"));
  try
    int_of_string (String.sub buf mlen 4)
  with Failure _ ->
    raise(Error(filename ^ ": bad magic number"));

type db_chan = {zipped : bool ; ic : in_channel}

let open_db filename =
  if Filename.check_suffix filename ".gz" then
    { ic = Unix.open_process_in ("gunzip -c " ^ filename);
      zipped = true; }
  else
    { ic = open_in_bin filename ;
      zipped = false }

let close_db {zipped = zipped ; ic = ic } =
  if zipped 
  then ignore(Unix.close_process_in ic)
  else close_in ic

let current_version =
  if Sys.ocaml_version < "4.03" then 1
  else 2

let read_hashtbl filename ic version =
  try
    let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in
    if version = current_version then tbl
    else if version > current_version then
      raise (Error(filename ^ ": database version not supported"))
    else begin
      Printf.eprintf "%s: converting from version %d to version %d\n\
                      Run 'spamoracle upgrade' to suppress this warning.\n%!"
                     filename version current_version;
      let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in
      Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl;
      tbl'
    end
  with Failure _ ->
    raise (Error(filename ^ ": database is corrupted"))

let read_short filename =
  let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
  let version = check_magic filename ic in
  let ng = input_binary_int ic in
  let ns = input_binary_int ic in
  let freq = read_hashtbl filename ic version in
  close_db db_ic;
  { s_num_good = ng; s_num_spam = ns; s_freq = freq }

let read_full filename =
  let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
  let version = check_magic filename ic in
  let ng = input_binary_int ic in
  let ns = input_binary_int ic in
  let high_freq = read_hashtbl filename ic version in
  let low_freq = read_hashtbl filename ic version in
  close_db db_ic;
  { f_num_good = ng; f_num_spam = ns; 
    f_low_freq = low_freq; f_high_freq = high_freq }

let temp_file basename =
  let pid = Unix.getpid() in
  let rec tmpfile counter =
    if counter > 10000 then raise (Error "cannot create temporary database");
    let filename = basename ^ string_of_int (pid + counter) in
    try
      (filename,
       open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600
                    filename)
    with Sys_error _ ->
      tmpfile (counter + 1)
  in tmpfile 0

let write_full filename db =
  let basename, zip =
    if Filename.check_suffix filename ".gz" then
      Filename.chop_suffix filename ".gz", true
    else
      filename, false in
  let (tempname, oc) = temp_file (basename ^ ".tmp") in
  Printf.fprintf oc "%s%04d" magic current_version;
  output_binary_int oc db.f_num_good;
  output_binary_int oc db.f_num_spam;
  Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
  Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing];
  close_out oc;
  if zip then begin
    let r = Sys.command ("gzip -best " ^ tempname) in
    if r = 0 then
      Sys.rename (tempname ^ ".gz") filename
    else
      Sys.rename tempname basename
  end else
    Sys.rename tempname filename 


let create sz =
  { f_num_good = 0;
    f_num_spam = 0;
    f_high_freq = Hashtbl.create sz;
    f_low_freq = Hashtbl.create sz }

let add_good db w =
  begin try
    let (g, s as f) = Hashtbl.find db.f_high_freq w in
    Hashtbl.replace db.f_high_freq w (g+1, s)
  with Not_found ->
  try
    let (g, s as f) = Hashtbl.find db.f_low_freq w in
    let g' = g + 1 in
    if 2 * g' + s >= 5 then begin
      Hashtbl.remove db.f_low_freq w;
      Hashtbl.add db.f_high_freq w (g', s)
    end else
      Hashtbl.replace db.f_low_freq w (g', s)
  with Not_found ->
    Hashtbl.add db.f_low_freq w (1, 0)
  end

let add_spam db w =
  begin try
    let (g, s) = Hashtbl.find db.f_high_freq w in
    Hashtbl.replace db.f_high_freq w (g, s+1)
  with Not_found ->
  try
    let (g, s) = Hashtbl.find db.f_low_freq w in
    let s' = s + 1 in
    if 2 * g + s' >= 5 then begin
      Hashtbl.remove db.f_low_freq w;
      Hashtbl.add db.f_high_freq w (g, s')
    end else
      Hashtbl.replace db.f_low_freq w (g, s')
  with Not_found ->
    Hashtbl.add db.f_low_freq w (0, 1)
  end

open Printf

let dump db oc =
  let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in
  fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam;
  Hashtbl.iter dump_entry db.f_high_freq;
  Hashtbl.iter dump_entry db.f_low_freq

let split s =
  try
    let i = String.index s ' ' in
    let j = String.index_from s (i + 1) ' ' in
    (String.sub s 0 i,
     int_of_string (String.sub s (i + 1) (j - i - 1)),
     int_of_string (String.sub s (j + 1) (String.length s - j - 1)))
  with Not_found ->
    raise(Error("Database restoration: ill-formed line `"
                ^ String.escaped s ^ "'"))

let restore ic =
  let db = create 997 in
  begin try
    let (w, ng, ns) = split (input_line ic) in
    if w <> "SPAMORACLE/1"
    then raise (Error("Database restoration: wrong version"));
    db.f_num_good <- ng;  
    db.f_num_spam <- ns
  with End_of_file ->
    raise (Error("Database restoration: first line missing"));
  end;
  begin try
    while true do
      let (w, g, s) = split (input_line ic) in
      if 2 * g + s >= 5
      then Hashtbl.add db.f_high_freq w (g, s)
      else Hashtbl.add db.f_low_freq w (g, s)
    done
  with End_of_file ->
    ()
  end;
  db

let in_short db w = Hashtbl.mem db.s_freq w
let in_full  db w = Hashtbl.mem db.f_high_freq w