1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
|
(***********************************************************************)
(* *)
(* SpamOracle -- a Bayesian spam filter *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. This file is distributed under the terms of the *)
(* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *)
(* *)
(***********************************************************************)
(* $Id: mbox.ml,v 1.4 2002/08/26 09:35:25 xleroy Exp $ *)
(* Reading of a mailbox file and splitting into individual messages *)
(* State of a mailbox that is being read message by message. *)
type t =
(* Channel the mailbox lines are read from (see read_msg). *)
{ ic: in_channel;
(* True when [ic] is the output of a "gunzip -c" process started by
   open_mbox_file; close_mbox uses it to choose how to close [ic]. *)
zipped: bool;
(* Text that must begin the next message: read_msg stores here the
   "From " line (with its newline) that ended the previous message, and
   resets it to "" when end of input is reached. *)
mutable start: string;
(* Scratch buffer in which read_msg accumulates the current message;
   it is cleared at the start of every read_msg call. *)
buf: Buffer.t }
(* Open the mailbox stored in file [filename] for reading.
   A name ending in ".gz" is read through a "gunzip -c" subprocess and
   the result is marked as zipped; any other name is opened directly
   with open_in.  The returned state has no pending "From " line and a
   fresh message buffer. *)
let open_mbox_file filename =
  if Filename.check_suffix filename ".gz" then
    (* The command is interpreted by the shell, so the file name must be
       quoted: otherwise spaces or shell metacharacters in the name would
       split the argument or run unintended commands. *)
    { ic = Unix.open_process_in ("gunzip -c " ^ Filename.quote filename);
      zipped = true;
      start = "";
      buf = Buffer.create 50000 }
  else
    { ic = open_in filename;
      zipped = false;
      start = "";
      buf = Buffer.create 50000 }
(* Wrap the already opened input channel [ic] as a mailbox reader.
   The channel is treated as plain (not zipped) data; the reader starts
   with no pending "From " line and an empty message buffer. *)
let open_mbox_channel ic =
  let message_buffer = Buffer.create 50000 in
  { buf = message_buffer;
    start = "";
    zipped = false;
    ic = ic }
(* Return the next message of mailbox [t] as a string.
   The message starts with the "From " line remembered from the previous
   call (if any) and extends up to, but not including, the next line
   that begins with "From ".  That separator line is saved in [t.start]
   for the following call.  Every line is terminated by '\n' in the
   result.
   At end of input the text accumulated so far is returned and
   [t.start] is cleared; if nothing was accumulated, End_of_file is
   raised instead. *)
let read_msg t =
  Buffer.clear t.buf;
  Buffer.add_string t.buf t.start;
  let is_from_line line =
    String.length line >= 5 && String.sub line 0 5 = "From " in
  try
    let separator = ref "" in
    let scanning = ref true in
    while !scanning do
      let line = input_line t.ic in
      (* A "From " line only ends a message when something has already
         been collected; otherwise it is the first line of this one. *)
      if Buffer.length t.buf > 0 && is_from_line line then begin
        separator := line;
        scanning := false
      end else begin
        Buffer.add_string t.buf line;
        Buffer.add_char t.buf '\n'
      end
    done;
    t.start <- (!separator ^ "\n");
    Buffer.contents t.buf
  with End_of_file ->
    if Buffer.length t.buf = 0 then raise End_of_file;
    t.start <- "";
    Buffer.contents t.buf
(* Close the input channel of mailbox [t].
   A channel obtained from a gunzip subprocess is closed with
   Unix.close_process_in and the process status is discarded; any other
   channel is closed with close_in. *)
let close_mbox t =
  if not t.zipped then
    close_in t.ic
  else begin
    let _exit_status = Unix.close_process_in t.ic in
    ()
  end
(* Apply [fn] to every message of the mailbox file [filename], in order.
   The file (or gunzip pipe) is opened here and is always closed before
   returning, including when [fn] raises.
   End_of_file ends the iteration normally; because the handler covers
   the whole loop, this holds whether it is raised by read_msg or by
   [fn] itself.  Any other exception is re-raised after the mailbox has
   been closed. *)
let mbox_file_iter filename fn =
  let ic = open_mbox_file filename in
  try
    while true do fn (read_msg ic) done
  with
  | End_of_file ->
      close_mbox ic
  | exn ->
      (* Release the file or subprocess on the error path as well.
         A failure while closing must not hide the original exception. *)
      (try close_mbox ic with Sys_error _ | Unix.Unix_error _ -> ());
      raise exn
(* Apply [fn] to every message read from the input channel [inchan], in
   order.  When End_of_file is raised (by read_msg at end of input, or
   by [fn]), the iteration stops and the channel is closed through
   close_mbox. *)
let mbox_channel_iter inchan fn =
  let mbox = open_mbox_channel inchan in
  let rec feed_all () =
    fn (read_msg mbox);
    feed_all () in
  try feed_all ()
  with End_of_file -> close_mbox mbox
(* Read everything that remains on the input channel [inchan] and return
   it as a single string.  The channel is not closed.
   The data is read in chunks of at most 1024 bytes until [input]
   reports 0, which signals end of input. *)
let read_single_msg inchan =
  let res = Buffer.create 10000 in
  (* [input] fills a mutable byte buffer, so the chunk must be [bytes]
     rather than [string]. *)
  let chunk = Bytes.create 1024 in
  let rec read () =
    let n = input inchan chunk 0 (Bytes.length chunk) in
    if n > 0 then begin
      Buffer.add_subbytes res chunk 0 n;
      read ()
    end in
  read ();
  Buffer.contents res
|