File: main.ml

package info (click to toggle)
spamoracle 1.6-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 296 kB
  • sloc: ml: 1,380; makefile: 135
file content (283 lines) | stat: -rw-r--r-- 9,381 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* Argument parsing and main program *)

open Printf
open Mbox
open Database
open Processing

exception Usage of string

let default_config_name =
  try Filename.concat (Sys.getenv "HOME") ".spamoracle.conf"
  with Not_found -> ".spamoracle.conf"

let parse_config_file file =
  try
    let errs = Configfile.parse Config.options file in
    if errs <> [] then begin
      eprintf "Error while reading configuration file %s:\n" file;
      List.iter (fun (line, msg) -> eprintf "Line %d: %s\n" line msg) errs;
      exit 2
    end
  with Sys_error msg ->
    eprintf "Cannot read configuration file %s:\n%s\n" file msg

let mark_command args =
  let db = Database.read_short !Config.database_name in
  if args = [] then
    mark_message db (read_single_msg stdin)
  else
    List.iter (fun f -> mbox_file_iter f (mark_message db)) args

let add_command args =
  let db =
    try
      Database.read_full !Config.database_name
    with Sys_error _ ->
      Database.create 997 in
  let processed = ref false
  and is_spam = ref true
  and verbose = ref false in
  let rec parse_args = function
    | "-v" :: rem ->
        verbose := true; parse_args rem
    | "-spam" :: rem ->
        is_spam := true; parse_args rem
    | "-good" :: rem ->
        is_spam := false; parse_args rem
    | f :: rem ->
        mbox_file_iter f (add_message db !verbose !is_spam);
        processed := true;
        parse_args rem
    | [] ->
        if not !processed then
          add_message db !verbose !is_spam (read_single_msg stdin);
        if !verbose then
          printf "\r%6d / %6d  good / spam messages\n"
                 db.f_num_good db.f_num_spam
  in parse_args args; Database.write_full !Config.database_name db

let list_command args =
  let db = Database.read_full !Config.database_name in
  let res = ref [] in
  List.iter
    (fun s ->
      let re = Str.regexp (s ^ "$") in
      let match_word w (g, s) =
        if Str.string_match re w 0 then begin
          let p =
            if 2 * g + s < 5
            then -1.0
            else Rankmsg.word_proba g s db.f_num_good db.f_num_spam in
          res := (w, p, g, s) :: !res
        end in
      Hashtbl.iter match_word db.f_high_freq;
      Hashtbl.iter match_word db.f_low_freq)
    args;
  if !res = [] then
    Printf.printf "No matching word found in database.\n"
  else begin
    Printf.printf "     Word         Proba   #good   #spam\n";
    List.iter
      (fun (w, p, g, s) ->
        if p >= 0.0 then
          Printf.printf "%-15s%8.2f%8d%8d\n" w p g s
        else
          Printf.printf "%-15s    ----%8d%8d\n" w g s)
      (List.sort
        (fun (_, p1, _, _) (_, p2, _, _) -> compare p2 p1)
        !res)
  end

let test_command args =
  let db = Database.read_short !Config.database_name in
  let low = ref 0.0 and high = ref 1.0 in
  let rec parse_args = function
    | "-min" :: s :: rem ->
        begin try low := float_of_string s
        with Failure _ -> raise(Usage("bad argument to -min"))
        end;
        parse_args rem
    | "-min" :: [] ->
        raise(Usage("no argument to -min"))
    | "-max" :: s :: rem ->
        begin try high := float_of_string s
        with Failure _ -> raise(Usage("bad argument to -max"))
        end;
        parse_args rem
    | "-max" :: [] ->
        raise(Usage("no argument to -max"))
    | f :: rem ->
        mbox_file_iter f (test_message db !low !high f);
        parse_args rem
    | [] -> ()
  in parse_args args

let stat_command args =
  let db = Database.read_short !Config.database_name in
  let stat_mbox f =
    let num_msgs = ref 0
    and num_good = ref 0
    and num_spam = ref 0
    and num_unknown = ref 0 in
    mbox_file_iter f
      (fun s ->
        incr num_msgs;
        match stat_message db s with
          Msg_good -> incr num_good
        | Msg_spam -> incr num_spam
        | Msg_unknown -> incr num_unknown);
    let percentage a b =
      100.0 *. float a /. float b in
    if !num_msgs > 0 then
      printf "%s: %d (%.2f%%) good, %d (%.2f%%) unknown, %d (%.2f%%) spam\n"
             f 
             !num_good (percentage !num_good !num_msgs)
             !num_unknown (percentage !num_unknown !num_msgs)
             !num_spam (percentage !num_spam !num_msgs)
  in List.iter stat_mbox args

let words_command args =
  let db = Database.read_short !Config.database_name in
  if args = [] then
    wordsplit_message db (read_single_msg stdin)
  else
    List.iter
      (fun f ->
        mbox_file_iter f
          (fun msg ->
            print_string "----------------------------------------\n";
            wordsplit_message db msg))
      args

let backup_command () =
  Database.dump (Database.read_full !Config.database_name) stdout

let restore_command () =
  Database.write_full !Config.database_name (Database.restore stdin)

let upgrade_command () =
  let db = Database.read_full !Config.database_name in
  Database.write_full !Config.database_name db;
  printf "Converted %s to version %d.\n"
         !Config.database_name
         Database.current_version

let rec parse_args_1 = function
    "-config" :: file :: rem ->
      parse_config_file file; parse_args_2 rem
  | "-config" :: [] ->
      raise(Usage("Option -config requires an argument"))
  | rem ->
      if Sys.file_exists default_config_name
      then parse_config_file default_config_name;
      parse_args_2 rem

and parse_args_2 = function
  | "-f" :: file :: rem ->
      Config.database_name := file; parse_args_3 rem
  | "-f" :: [] ->
      raise(Usage("Option -f requires an argument"))
  | rem ->
      parse_args_3 rem

and parse_args_3 = function
    "mark" :: rem ->
      mark_command rem
  | "add" :: rem ->
      add_command rem
  | "list" :: rem ->
      list_command rem
  | "test" :: rem ->
      test_command rem
  | "stat" :: rem ->
      stat_command rem
  | "backup" :: rem ->
      backup_command ()
  | "restore" :: rem ->
      restore_command ()
  | "words" :: rem ->
      words_command rem
  | "upgrade" :: rem ->
      upgrade_command ()
  | s :: rem ->
      raise(Usage("Unknown command " ^ s))
  | [] ->
      raise(Usage "")

let usage_string = "\
Usage:
  spamoracle [-config conf] [-f db] mark {mailbox}*
  Add 'X-Spam:' headers to messages with result of analysis
    {mailbox}*   Mailboxes containing messages to analyze and mark
                 If none given, read single msg from standard input

  spamoracle [-config conf] [-f db] add [-v] -spam {spambox}* -good {goodbox}*
  Create or update database with known spam or non-spam messages
    -v           Print progress bar
    -spam        Indicate subsequent mailboxes contain spam
    -good        Indicate subsequent mailboxes contain good msgs (not spam)
    {spambox}*   Mailboxes containing spam
    {goodbox}*   Mailboxes containing good messages (not spam)
                 If no mailbox given, read single msg from standard input

  spamoracle [-config conf] [-f db] test [-min prob] [-max prob] {mailbox}*
  Analyze messages and print summary of results for each message
    -min <prob>  Don't print messages with result below <prob>   
    -max <prob>  Don't print messages with result above <prob>   
    {mailbox}*   Mailboxes containing messages to analyze

  spamoracle [-config conf] [-f db] stat {mailbox}*
  Analyze messages and print percentages of spam/non-spam for each mailbox
    {mailbox}*   Mailboxes containing messages to analyze

  spamoracle [-config conf] [-f db] list {regexp}*
  Dump word statistics in database
    {regexp}*    Regular expressions for words we are interested in

  spamoracle [-config conf] [-f db] backup > database.backup
  Dump whole database in portable text format on standard output

  spamoracle [-config conf] [-f db] restore < database.backup
  Restore database from text backup file read from standard input

  spamoracle [-config conf] [-f db] upgrade
  Convert database to the latest format

  spamoracle [-config conf] [-f db] words {mailbox}*
  Extract words from messages and print them
    {mailbox}*   Mailboxes containing messages to scan
                 If no mailbox given, read single msg from standard input

  Common options:
    -config <conf> Configuration file (default $HOME/.spamoracle.conf)
    -f <db>        Database to use (default $HOME/.spamoracle.db)"

let main () =
  try
    parse_args_1 (List.tl (Array.to_list Sys.argv))
  with
  | Usage msg ->
      eprintf "%s\n%s\n" msg usage_string;
      exit 2
  | Sys_error msg ->
      eprintf "System error: %s\n" msg
  | Database.Error msg ->
      eprintf "%s\n" msg

let _ = main()