File: triplets.pl

package info (click to toggle)
spamassassin 3.1.7-2
  • links: PTS
  • area: main
  • in suites: etch-m68k
  • size: 5,376 kB
  • ctags: 2,123
  • sloc: perl: 39,706; ansic: 3,133; sh: 1,344; sql: 170; makefile: 168
file content (50 lines) | stat: -rwxr-xr-x 808 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/perl

# Produces the rules/triplets.txt file used by
# Eval::check_for_unique_subject_id()

use strict;
use warnings;

# Length, in characters, of the fragments extracted from each word.
my $FRAG_LEN = 3;

# _word_fragments($word, $frag_len)
#
# Returns every contiguous substring of $word that is exactly $frag_len
# characters long, in left-to-right order (duplicates are not removed).
# Returns the empty list when $word is shorter than $frag_len.
sub _word_fragments {
  my ($word, $frag_len) = @_;

  # The last valid start offset is length - frag_len, so the range must be
  # inclusive; otherwise the final fragment of every word is lost.  A word
  # of exactly $frag_len characters yields itself, and a shorter word gives
  # an empty range.
  return map { substr($word, $_, $frag_len) }
             0 .. (length($word) - $frag_len);
}

# main()
#
# Reads the dictionary files named in @ARGV one word per line, collects the
# distinct fragments of every accepted word, and prints them to STDOUT one
# per line.  Progress and the final word count go to STDERR.  Exits with
# status 1 after a usage message when no file names are given; otherwise
# returns 0.
sub main {
  if (@ARGV == 0) {
    print STDERR "Usage: triplets.pl dict_file1 [dict_file2 ...] > triplets.txt\n";
    exit(1);
  }

  my %fragments = ();
  my $word_num = 0;

  while (my $word = <>) {
    chomp $word;

    $word_num++;

    # Report progress before any filtering so that a dot appears for every
    # 1000 input lines, even when the 1000th line is skipped below.
    if ($word_num % 1000 == 0) {
      print STDERR ".";
    }

    # Ignore proper names (and anything else that is not purely a-z)
    next if ($word =~ /[^a-z]/);

    $fragments{$_} = 1 for _word_fragments($word, $FRAG_LEN);
  }

  print STDERR "\n\n$word_num words processed\n";

  # keys() returns entries in an unspecified order; sort so that the
  # generated file is identical from one run to the next.
  print join("\n", sort keys(%fragments)), "\n";

  return 0;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without consuming @ARGV or exiting.
exit(main()) unless caller;

1;