File: 2-utf8.t

package info (click to toggle)
liblingua-stopwords-perl 0.12-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 196 kB
  • sloc: perl: 778; makefile: 7
file content (60 lines) | stat: -rw-r--r-- 1,547 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
use strict;
use Test::More;

use utf8;

use lib qw(../lib/);

use Test::More;
use Test::More::UTF8;

# Compile-time setup. Encode is treated as optional: when it cannot be
# loaded the whole test file is skipped. Otherwise is_utf8() and encode()
# are imported for the checks below and the module under test is loaded.
BEGIN {
    # A block eval with require/import performs the same compile-time
    # import as "use", without compiling code from a string. The trailing
    # 1 makes the eval's return value a reliable success signal.
    my $have_encode = eval {
        require Encode;
        Encode->import(qw( is_utf8 encode ));
        1;
    };
    if ( !$have_encode ) {
        plan skip_all => "Encode module not available";
    }
    use_ok('Lingua::StopWords');
}

# Fixture table, one row per language:
#   [ language code, a word looked up in that language's stoplist,
#     the encoding passed to encode() for the non-UTF-8 lookup ]
my $tests = [
    [ 'da', 'når',  'iso-8859-1' ],
    [ 'de', 'für',  'iso-8859-1' ],
    [ 'en', 'our',  'iso-8859-1' ],
    [ 'es', 'qué',  'iso-8859-1' ],
    [ 'fi', 'minä', 'iso-8859-1' ],
    [ 'fr', 'été',  'iso-8859-1' ],
    [ 'hu', 'elő',  'iso-8859-2' ],
    [ 'it', 'più',  'iso-8859-1' ],
    [ 'id', 'dia',  'iso-8859-1' ],
    [ 'nl', 'hij',  'iso-8859-1' ],
    [ 'no', 'på',   'iso-8859-1' ],
    [ 'ro', 'ăla',  'iso-8859-2' ],
    [ 'pt', 'não',  'iso-8859-1' ],
    [ 'ru', 'все',  'KOI8-R' ],
    [ 'sv', 'för',  'iso-8859-1' ],
];


# For every fixture row, check both flavours of stoplist returned by
# Lingua::StopWords::getStopWords():
#   * called with 'UTF-8'       - the fixture word is present and the keys
#                                 are UTF-8 flagged character strings;
#   * called without an encoding - the word, encoded to the row's legacy
#                                 encoding, is present and no key is flagged.
# Each row contributes exactly four assertions.
for my $test ( @{$tests} ) {
    my ( $language, $word, $encoding ) = @{$test};

    my $stoplist = Lingua::StopWords::getStopWords( $language, 'UTF-8' );
    ok( $stoplist->{$word}, "UTF-8 encoded version present in stoplist [$language]" );

    # Keys are inspected exactly as returned. Appending a wide character
    # before calling is_utf8() would switch the flag on no matter what the
    # library produced, making the assertion pass unconditionally. Keys
    # that are pure ASCII may come back from a hash without the flag, so
    # only keys containing a non-ASCII character are required to carry it.
    # All keys are examined so the result does not depend on hash ordering.
    my @unflagged = grep { /[^\x00-\x7F]/ && !is_utf8($_) } keys %{$stoplist};
    ok( !@unflagged, "the stoplist keys of [$language] are flagged as UTF-8" )
        or diag( scalar(@unflagged) . " non-ASCII key(s) lack the UTF-8 flag" );

    $stoplist = Lingua::StopWords::getStopWords($language);
    my $octet = encode( $encoding, $word );
    ok( $stoplist->{$octet}, "Non-utf8-flagged version of [$language] present" );

    # Every key of the default stoplist must be an unflagged octet string.
    my @flagged = grep { is_utf8($_) } keys %{$stoplist};
    ok( !@flagged, "the stoplist keys of [$language] are not flagged as UTF-8" )
        or diag( scalar(@flagged) . " key(s) unexpectedly carry the UTF-8 flag" );
}

done_testing;