File: gen_modules.plx

package info (click to toggle)
liblingua-stopwords-perl 0.09-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 156 kB
  • sloc: perl: 853; makefile: 2
file content (105 lines) | stat: -rwxr-xr-x 2,536 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/perl

# Used to automatically generate the Lingua::StopWords::XX modules.
use lib qw( lib );
use strict;
use Lingua::StopWords;
use Lingua::Stem::Snowball qw( stemmers );
use Encode qw( from_to );
use Text::Wrap qw( wrap );
use Getopt::Long;

# tabs are evil: stop Text::Wrap from turning runs of spaces back into tabs
$Text::Wrap::unexpand = 0;

# snowdir should be the snowball_all directory
my $snowdir;
my $usage = "Usage ./bin/gen_modules.plx --snowdir=SNOWDIR";

# GetOptions returns false on unknown or malformed options; treat that
# as a usage error instead of carrying on with a partial command line.
GetOptions( 'snowdir=s' => \$snowdir )
    or die $usage;

# --snowdir is mandatory and must name an existing directory.
die $usage
    unless defined $snowdir and -d $snowdir;


# Source text for each generated Lingua::StopWords::XX module.
# The heredoc is single-quoted, so nothing inside it is interpolated here;
# the #ISO#, #VERSION#, #UTF8# and #PLAIN# markers are replaced by the
# per-language loop further down.  The generated getStopWords() returns a
# hash reference keyed by stopword: built from the #UTF8# list (each key
# run through pack "U0a*") when its first argument is 'UTF-8', otherwise
# built from the #PLAIN# list.
my $template = <<'END_MODULE';
package Lingua::StopWords::#ISO#;

use strict;
use warnings;

use Exporter;
our @ISA = qw(Exporter);

our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); 
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our $VERSION = #VERSION#;

sub getStopWords {
    if ( @_ and $_[0] eq 'UTF-8' ) {
        # adding U0 causes the result to be flagged as UTF-8
        my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( 
#UTF8# 
        );
        return \%stoplist;
    }
    else {
        my %stoplist = map { ( $_, 1 ) } qw( 
#PLAIN# 
        );
        return \%stoplist;
    }
}

1;
END_MODULE

# Maps the two-letter code used in the generated module name
# (Lingua::StopWords::XX) to the language directory name looked up under
# the snowball "algorithms" directory.  Written as code/name pairs.
my %languages = qw(
    DA  danish
    DE  german
    EN  english
    ES  spanish
    FI  finnish
    FR  french
    HU  hungarian
    IT  italian
    NL  dutch
    NO  norwegian
    PT  portuguese
    RU  russian
    SV  swedish
);

# Generate one lib/Lingua/StopWords/XX.pm file per entry in %languages.
# For each language: read the snowball stop.txt, strip comments, collect the
# words, render them once in the source encoding and once in UTF-8, fill in
# the template, and write the module out.  Any I/O failure is fatal.
# Keys are visited in sorted order so the progress output is deterministic.
for my $iso ( sort keys %languages ) {
    my $lang = $languages{$iso};
    my $file = "$snowdir/algorithms/$lang/stop.txt";
    print STDERR "Generating '$lang' stopword list module\n";

    # extract stoplists from snowball source files; parse
    my @words;
    open( my $stop_fh, "<", $file )
        or die "Couldn't open file '$file': $!";
    while ( my $line = <$stop_fh> ) {
        # drop everything from the first '|' to the end of the line
        $line =~ s/\|.*//;

        # awk-style split: leading whitespace is skipped and blank or
        # comment-only lines yield no words at all
        push @words, split( ' ', $line );
    }
    close($stop_fh)
        or die "Couldn't close file '$file': $!";

    # indentation that lines the word lists up inside the template's qw( )
    my $indent = '            ';

    # the plain list keeps the bytes exactly as read from the stop file
    my $plain = wrap( $indent, $indent, @words );

    # translate to UTF-8
    # %languages stores full language names, so compare against 'russian'
    # (a comparison against 'ru' can never match).
    # NOTE(review): assumes the Russian stop.txt is KOI8-R and all others
    # are ISO-8859-1 -- confirm against the snowball tree in use.
    my $source_enc = $lang eq 'russian' ? 'koi8-r' : 'iso-8859-1';
    from_to( $_, $source_enc, 'UTF-8' ) for @words;
    my $utf8 = wrap( $indent, $indent, @words );

    # sub in the lists
    my $mod = $template;
    $mod =~ s/#VERSION#/$Lingua::StopWords::VERSION/g;
    $mod =~ s/#ISO#/$iso/g;
    $mod =~ s/#PLAIN#/$plain/g;
    $mod =~ s/#UTF8#/$utf8/g;

    # blast it out
    my $out_file = "lib/Lingua/StopWords/$iso.pm";
    open( my $out_fh, ">", $out_file )
        or die "Couldn't open file '$out_file' for writing: $!";
    print {$out_fh} $mod
        or die "Couldn't write to file '$out_file': $!";

    # buffered write errors only surface at close, so check it
    close($out_fh)
        or die "Couldn't close file '$out_file': $!";
}