File: refhosts.ml

package info (click to toggle)
spamoracle 1.6-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 296 kB
  • sloc: ml: 1,380; makefile: 135
file content (43 lines) | stat: -rw-r--r-- 1,528 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* Extract hostnames or IP addresses referenced from http URLs
   in message bodies. *)

(* Case-insensitive pattern for an http URL up to the end of its host part.
   Group 1 is the optional "userinfo@" prefix; group 2 is the host: a dotted
   name or dotted-decimal IP address with at least two components.
   The userinfo part must not contain whitespace, '/', '?' or '#': these
   characters cannot occur before the '@' of a URL authority.  Without this
   restriction the pattern would run from one URL to any later '@' in the
   text (for instance an e-mail address in a signature), report the host
   following that '@', and make the scan skip every URL in between. *)
let re_url =
  Str.regexp_case_fold
    "http://\\([^@/?# \t\r\n]+@\\)?\\([a-z0-9-]+\\(\\.[a-z0-9-]+\\)+\\)"

(* Sets of strings, ordered by [String.compare]. *)
module StringSet = Set.Make (String)

(* Accumulator for the hosts found so far.  It starts out empty; using a set
   means that each distinct host string is recorded only once. *)
let hosts : StringSet.t ref =
  ref StringSet.empty

(* Forget every host recorded so far: [hosts] becomes the empty set again. *)
let reset () =
  hosts := StringSet.empty

(* [add_urls txt pos] scans [txt] from offset [pos] for matches of [re_url]
   and adds the host of each match (group 2 of the pattern) to [hosts].
   Each search resumes where the previous match ended. *)
let add_urls txt pos =
  (* [Some start] if a URL is found at or after [from], [None] otherwise.
     The exception handler wraps only the search itself, so the recursive
     call in [scan] stays in tail position. *)
  let next_url from =
    try Some (Str.search_forward re_url txt from)
    with Not_found -> None in
  let rec scan from =
    match next_url from with
    | None -> ()
    | Some _ ->
        let host = Str.matched_group 2 txt in
        hosts := StringSet.add host !hosts;
        scan (Str.match_end ())
  in
  scan pos

(* [add txt] records the hosts of all URLs found in [txt], scanning the
   string from its first character. *)
let add txt =
  let start = 0 in
  add_urls txt start

(* [summarize ()] returns the recorded hosts as a single string, separated
   by single spaces and listed in increasing [String.compare] order, then
   leaves [hosts] empty.  The result is "" when no host was recorded. *)
let summarize () =
  (* Take a snapshot first, so that the accumulator can be cleared before
     the result string is built. *)
  let collected = !hosts in
  hosts := StringSet.empty;
  String.concat " " (StringSet.elements collected)