File: train.pl

package info (click to toggle)
dspam 3.6.8-5etch1
  • links: PTS
  • area: main
  • in suites: etch
  • size: 4,372 kB
  • ctags: 1,457
  • sloc: ansic: 24,738; sh: 9,860; perl: 2,378; makefile: 546; sql: 327
file content (102 lines) | stat: -rwxr-xr-x 2,848 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl

# train.pl
# This tool trains a corpus of messages (a directory containing a nonspam and
# a spam subdirectory, each with messages in individual files) to a specific
# user and reports on errors. 
#
# You can use four different training paradigms; the three stock training modes
# included with dspam (teft, toe, tum) or tune (train until no errors) by 
# setting toe mode training and rerunning this script until little or no errors
# are generated.

use strict;
use vars qw { $USER $PATH $REPORTING_WINDOW $CORPUS $TRAINING_MODE };

$REPORTING_WINDOW  = 250;			# How often to summarize
$PATH              = "/usr/local/dspam/bin";	# Path to dspam binaries
$TRAINING_MODE     = "teft";			# Training mode

### DO NOT CONFIGURE BELOW THIS LINE ###

$USER = shift;
$CORPUS = shift;

if ($CORPUS eq "") {
  die "Usage: $0 [username] [corpus]";
}

&Train("$CORPUS/nonspam", "$CORPUS/spam");

sub Train {
  my($nonspam, $spam) = @_;
  my(@nonspam_corpus, @spam_corpus);
  my($ic, $sc, $fp, $sm, $num);

  print "Training $nonspam / $spam corpora...\n";
  @nonspam_corpus = GetFiles($nonspam);
  @spam_corpus = GetFiles($spam); 

  $ic = $sc = $fp = $sm = $num = 0; 

  while($#nonspam_corpus && $#spam_corpus) {
    my($nonspam_msg) = shift(@nonspam_corpus);
    my($spam_msg) = shift(@spam_corpus);
    my($cmd, $response);
 
    # Process one nonspam

    $cmd = "$PATH/dspam --user $USER --mode=$TRAINING_MODE --deliver=stdout < $nonspam/$nonspam_msg";
    $response = `$cmd`;
    if ($response =~ /X-DSPAM-Result: (Innocent|Whitelisted)/i) {
      $ic++;
    } else {
      $fp++;
      print "FP: $nonspam_msg\n";
      open(TRAIN, "|$PATH/dspam --user $USER --mode=$TRAINING_MODE --class=innocent --source=error");
      print TRAIN $response;
      close(TRAIN);
    }

    # Process one spam

    $cmd = "$PATH/dspam --user $USER --mode=$TRAINING_MODE --deliver=stdout < $spam/$spam_msg";
    $response = `$cmd`;
    if ($response =~ /X-DSPAM-Result: Spam/i) {
      $sc++;
    } else {
      print "SM: $spam_msg\n";
      $sm++;
      open(TRAIN, "|$PATH/dspam --user $USER --mode=$TRAINING_MODE --class=spam --source=error");
      print TRAIN $response;
      close(TRAIN);
    }
    $num+=2;

    if ($num % $REPORTING_WINDOW == 0) {
      print "Spam Correct   : $sc\n";
      print "Spam Missed    : $sm\n";
      print "Nonspam Correct: $ic\n";
      print "Nonspam Missed : $fp\n"; 
      print "--------------------\n";
      $sc = $sm = $ic = $fp = $num = 0;
    }
  }

  print "Spam Correct   : $sc\n";
  print "Spam Missed    : $sm\n";
  print "Nonspam Correct: $ic\n";
  print "Nonspam Missed : $fp\n\n";

  system("$PATH/dspam_stats -S $USER");
}

sub GetFiles {
  my($corpus) = @_;
  my(@files);

  opendir(DIR, "./$corpus") || die $!;
  @files = grep(!/^\./, readdir(DIR));
  closedir(DIR);
  return @files;
}