File: split_corpora

package info (click to toggle)
spamassassin 3.1.7-2
  • links: PTS
  • area: main
  • in suites: etch-m68k
  • size: 5,376 kB
  • ctags: 2,123
  • sloc: perl: 39,706; ansic: 3,133; sh: 1,344; sql: 170; makefile: 168
file content (94 lines) | stat: -rwxr-xr-x 1,924 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl -w

use FindBin;
use lib "$FindBin::Bin/../lib";

use strict;

use Mail::SpamAssassin::ArchiveIterator;
use Getopt::Std;
use FileHandle;

###########

# Print the command-line synopsis on STDERR and terminate the program
# with exit status 1.  Never returns.
sub usage {
  my $synopsis =
      "split-corpora [-n num_buckets] [-p outfile_prefix] "
    . "[-l max_messages] "
    . "folder1 ....\n";

  print STDERR $synopsis;
  exit(1);
} # usage()

###########

# Command-line switches, filled in by Getopt::Std::getopts() below:
#   -n num_buckets     number of output files to create (default 2)
#   -p outfile_prefix  prefix of the output file names (default "bucket")
#   -l max_messages    message limit; unset or 0 means no limit
#   -h                 print the usage summary and exit
our ($opt_n, $opt_p, $opt_h, $opt_l);

# Use getopts(), not getopt(): getopt() treats *every* character of its
# argument as a switch that takes a value, so "-h" would swallow the next
# command-line argument instead of acting as a flag.  getopts() honours the
# "x:" notation and returns false on an unknown switch.
getopts('n:p:l:h') or usage();

usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix      = $opt_p || "bucket";
my @IN_FILES    = @ARGV;

# The bucket count is used as a range bound and as a modulus, and the limit
# is compared numerically, so both must be plain non-negative integers
# (and the bucket count must be at least 1).
usage() if ($num_buckets !~ /^[1-9][0-9]*$/);
usage() if (defined $opt_l && $opt_l !~ /^[0-9]+$/);

usage() if (@IN_FILES == 0);

# Turn every input path into a target specification string of the form
# "ham:<format>:<path>", where <format> is "dir" for directories and
# "mbox" for anything else.
my @targets = ();
for my $path (@IN_FILES) {
  my $format = (-d $path) ? 'dir' : 'mbox';
  push @targets, "ham:${format}:${path}";
}

# Open one output file per bucket, named "$prefix.1" .. "$prefix.$num_buckets",
# truncating any existing file.  The handles are stored in @bucket_fhs in
# bucket order (bucket 1 at index 0).  Dies if any file cannot be opened.
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_file = "$prefix.$bucket";
  my $bucket_fh   = FileHandle->new();

  # Pass the mode as a separate argument so that characters in the
  # user-supplied prefix can never be interpreted as part of the open mode.
  if (!$bucket_fh->open($bucket_file, '>')) {
    die "Could not open '$bucket_file' for writing: $!\n";
  }

  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)

# Index into @bucket_fhs of the bucket that receives the next message.
my $current_bucket = 0;

# NOTE(review): the meaning of opt_j / opt_n / opt_all is defined by
# Mail::SpamAssassin::ArchiveIterator, not by this script -- consult its
# documentation before changing them.
my $iter = Mail::SpamAssassin::ArchiveIterator->new({
        'opt_j' => 1,
        'opt_n' => 1,
        'opt_all' => 1,
  });

# wanted() does the actual work; the second callback is a deliberate no-op.
$iter->set_functions(\&wanted, sub { });
my $messagecount = 0;

# wanted() aborts the run by dying with 'HITLIMIT'; that is the only
# exception that is swallowed here, everything else is re-thrown.
eval {
  $iter->run(@targets);
};
if ($@) { die $@ unless ($@ =~ /HITLIMIT/); }

# Buffered write errors (e.g. a full disk) only surface at close time, so
# check every close.  All handles are closed before the failure is reported.
my $close_failures = 0;
foreach my $fh (@bucket_fhs) {
  next if $fh->close();
  warn "Could not close bucket file: $!\n";
  $close_failures++;
}
if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}
if ($close_failures) {
  die "Failed to close $close_failures bucket file(s); output may be incomplete\n";
}

#############################################

# Per-message callback registered with the archive iterator.
#
# Writes the message to the current bucket file and then advances
# $current_bucket, wrapping around after the last bucket.
#
# Arguments: only the fourth one is used; it is an array reference whose
# elements are concatenated to form the message text (presumably one
# element per line -- confirm against the iterator's documentation).
#
# Dies with 'HITLIMIT' once $opt_l messages have been written; the caller
# recognises that string and treats it as a normal end of the run.
sub wanted {
  my (undef, undef, undef, $data_ref) = @_;

  # Compare before counting, so exactly $opt_l messages are written.  (A
  # post-increment combined with '>' lets one extra message through.)
  if ($opt_l && $messagecount >= $opt_l) { die 'HITLIMIT'; }
  $messagecount++;

  my $fh = $bucket_fhs[$current_bucket];

  # Make sure the message can be used for outputting mbox format: if it does
  # not already start with a "From " separator line, emit a synthetic one.
  # The separator is printed on its own rather than unshifted onto the
  # caller's array, so the iterator's data is left unmodified.
  my $first_line = defined $data_ref->[0] ? $data_ref->[0] : '';
  if ($first_line !~ /^From \S+ +... ... /) {
    $fh->print("From abc\@xyz.com Mon Jan  1 00:00:00 2000\n");
  }

  $fh->print( join("", @$data_ref) );

  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()