File: wordsplit.mlp

package info (click to toggle)
spamoracle 1.6-1
links: PTS, VCS
area: main
in suites: bookworm, bullseye
size: 292 kB
sloc: ml: 1,380; makefile: 135
file content (202 lines) | stat: -rw-r--r-- 5,312 bytes
parent folder | download | duplicates (2)
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2.                                      *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* Decompose a string into words. *)

{
(* Maximal length of a word *)

let maxlen = 12

(* Map uppercase to lowercase.  Remove ISO Latin 1 accents. *)

let tbl = "\
\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\
\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 \
!\"#$%&'()*+,-./\
0123456789:;<=>?\
@abcdefghijklmno\
pqrstuvwxyz[\\]^_\
`abcdefghijklmno\
pqrstuvwxyz{|}~\127\
\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\
\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159\
\160\
\
aaaaaaeceeeeiiii\
noooooouuuuyp\
aaaaaaeceeeeiiii\
noooooouuuuypy"

let normalize s =
  String.map (fun c -> tbl.[Char.code c]) s

(* Check if a string is all uppercase characters *)

let all_uppercase s =
  try
    for i = 0 to String.length s - 1 do
      let c = s.[i] in
      if (c >= 'A' && c <= 'Z')
      || (c >= '\192' && c <= '\214')
      || (c >= '\216' && c <= '\222')
      then ()
      else raise Exit
    done;
    true
  with Exit ->
    false

(* Take first n chars of string *)

let adjust s n =
  if String.length s <= n then s else String.sub s 0 n

(* Reassembly of stretched-out words *)

let reassembly_buffer = Bytes.create maxlen

let previous_words = ref ([] : string list)

let reassemble db action word =
  let rec reass idx = function
  | [] -> []
  | hd :: tl ->
      let l = String.length hd in
      let idx = idx - l in
      if idx < 0 then [] else begin
        Bytes.blit_string hd 0 reassembly_buffer idx l;
        let w = Bytes.sub_string reassembly_buffer idx (maxlen - idx) in
        if db w then action w;
        hd :: reass idx tl
      end in
  let l = String.length word in
  if l >= maxlen then begin
    previous_words := []
  end else begin
    let idx = maxlen - l in
    Bytes.blit_string word 0 reassembly_buffer idx l;
    previous_words := word :: reass idx !previous_words
  end

}

let letter = [
   'a'-'z' 'A'-'Z'
#ifdef FRENCH
   '' '' '' '' '' '' '' '' '' '' '' '' ''
   '' '' '' '' '' '' '' '' '' '' '' '' '' ''
#endif
#ifdef SPANISH
   '' '' '' '' '' '' ''
   '' '' '' '' '' '' ''
#endif
#ifdef ITALIAN
   '' '' '' '' '' ''
   '' '' '' '' '' ''
#endif
#ifdef GERMAN
   '' '' ''
   '' '' '' '' 
#endif
#ifdef PORTUGUESE
   '' '' '' '' '' '' '' '' '' '' '' '' ''
   '' '' '' '' '' '' '' '' '' '' '' '' ''
#endif
]

let word_constituent = letter | '\''

let numeric = ['0'-'9' '.' ',' '$' '%' '\164' (* Euro *)]

let weird_character = ['\127' - '\255']

#ifdef JAPANESE
let jis_snd =
  ['\032'-'\127']

let kanji =
  ['\048'-'\116'] jis_snd

let katakana =
  (['\037'] jis_snd | "\033\060")

let jis_char =
  ['\032'-'\116'] jis_snd

let jis_in =
  "\027$" '(' ? ['@' 'B']

let jis_out =
  "\027(" ['B' 'J' 'H']
#endif

rule split = parse
  | word_constituent +
      { fun db action ->
          let s = Lexing.lexeme lexbuf in
          let l = String.length s in
          if l >= 3 && all_uppercase s then action ("U" ^ string_of_int l);
          let s = normalize s in
          if !Config.reassemble_words then reassemble db action s;
          if l >= 3 && l <= maxlen then action s;
          split lexbuf db action }
  | numeric numeric numeric numeric *
      { fun db action ->
          let s = Lexing.lexeme lexbuf in
          if String.length s <= 8 then action (normalize s);
          split lexbuf db action }
#ifdef JAPANESE
  | jis_in
      { split_jis lexbuf }
#endif
  | weird_character weird_character weird_character weird_character *
      { fun db action ->
          let s = Lexing.lexeme lexbuf in
          action ("W" ^ string_of_int (String.length s));
          split lexbuf db action }
  | eof
      { fun db action -> () }
  | _
      { split lexbuf }

#ifdef JAPANESE
and split_jis = parse
    kanji kanji *
      { fun db action ->
          let s = Lexing.lexeme lexbuf in
          action ("\027$B" ^ adjust s 8 ^ "\027(B");
          split_jis lexbuf db action }
  | katakana katakana katakana katakana *
      { fun db action ->
          let s = Lexing.lexeme lexbuf in
          action ("\027$B" ^ adjust s 12 ^ "\027(B");
          split_jis lexbuf db action }
  | jis_char
      { split_jis lexbuf }
  | eof
      { fun db action -> () }
  | jis_out
      { split lexbuf }
  | _
      { split lexbuf }
#endif

{

let iter action db txt =
  previous_words := [];
  split (Lexing.from_string txt) db action

}