File: tmx-POStagger

package info (click to toggle)
libxml-tmx-perl 0.39-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 304 kB
  • sloc: perl: 1,712; xml: 115; makefile: 2
file content (227 lines) | stat: -rw-r--r-- 5,518 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/perl -s

# PODNAME: tmx-POStagger
# ABSTRACT: POStaggers translation units on a tmx file.

use strict;
use warnings;
use File::Temp qw/ :POSIX /;

my $tmpName = tmpnam();

our (
     $o,       # output filename
     $mwe,     # tag mwe  -- STILL NOT SUPPORTED
     $dates,   # do dates analysis...
     $ner,     # try to do NER....
     $compact, # compact form
     $s,       # use <s> tags (on by default)
    );

$s = 1 unless defined $s;

eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();

eval { require Lingua::FreeLing3::Word };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
Lingua::FreeLing3::Word->import();

my %initted = ();

use XML::TMX::Reader '0.25';

my $file = shift or die "You must supply the name of the file to tokenize";

my $reader = XML::TMX::Reader->new($file);

my $output = "pos_$file";
$output = $o if $o;

my $pos_languages = {};

binmode STDOUT, ":utf8";
$reader->for_tu(
  {
   -output => $tmpName,
   -verbose => 1,
   -verbatim => 1,
  },
  sub {
      my $tu = shift;
      for my $lang (keys %$tu) {
          if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) {
              $pos_languages->{$1} = 1;

              my $ln = lc $1;
              my $seg = $compact ? "" : "<![CDATA[";

              _init_ma($ln) unless exists $initted{$ln};

              my $txt = $tu->{$lang}{-seg};
              my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt;
              my $sentences = splitter($ln)->split(\@tokens);
              $sentences = morph($ln)->analyze($sentences);
              $sentences = hmm($ln)->tag($sentences);
              for my $stc (@$sentences) {
                  $seg .= "<s>\n" if $s;
                  $seg .= $compact ? _dump_compact($stc->words)
                                   : _dump_words($stc->words);
                  $seg .= "</s>\n" if $s;
              }
              $tu->{$lang}{-iscdata} = $compact ? 0 : 1;
              $seg .= "]]>" unless $compact;
              $tu->{$lang}{-seg} = $seg;
          }
      }
      return $tu;
   });

$reader = XML::TMX::Reader->new($tmpName);

$reader->for_tu({
	-output => $output,
	-verbatim => 1,
	-raw => 1,
	-prop => ($compact ? {} : {
                           'pos-tagged' => join(",",keys %$pos_languages),
                           'pos-fields' => "word,lemma,pos",
                           'pos-s-attributes' => ['s'],
                          })},
  sub { 
      $_ = $_[0];
      if (!/\[CDATA\[/) {
        s{(<s>)(.*?)(</s>)}
          {my ($before,$middle,$after) = ($1,$2,$3);
           for ($middle) {
              s/&/&amp;/g;s/</&lt;/g;s/>/&gt;/g;
           };
           $before.$middle.$after
          }ge;
      }
      return $_; });

#print STDERR "$tmpName\n";
unlink $tmpName if -e $tmpName;

sub _init_ma {
    my $lang = shift;

    morph($lang,
          QuantitiesDetection => 0,
          MultiwordsDetection => ($mwe // 0),
          NumbersDetection => 0,
          DatesDetection => ($dates // 0),
          NERecognition => ($ner // 0));

    $initted{$lang}++;
}

sub _dump_words {
    my @words = @_;
    my $seg = "";
    for my $t (@words) {
        if ($t->is_multiword) {
            $seg .= sprintf("<mwe lema=\"%s\" pos=\"%s\">\n", $t->lemma(), $t->tag());
            $seg .= _dump_words($t->get_mw_words);
            $seg .= "</mwe>\n";
        } else {
            $seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n";
        }
    }
    return $seg;
}

sub _dump_compact {
	my $seg = join(" " => map {
		my $t = $_;
		($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words)
		                  : ($t->lemma() || $t->lc_form()))
		. "/" . substr($t->tag(), 0, 1);
	} @_) . "\n";
	$seg =~ s/&/&amp;/g;
	$seg =~ s/</&lt;/g;
	$seg =~ s/>/&gt;/g;
	return $seg;
}

__END__

=pod

=encoding UTF-8

=head1 NAME

tmx-POStagger - POStaggers translation units on a tmx file.

=head1 VERSION

version 0.39

=head1 SYNOPSIS

   tmx-POStagger file.tmx      # creates pos_file.tmx

   tmx-POStagger -o=out.tmx file.tmx
        -ner    ... tags multiword named entities
        -dates
        -compact

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.

At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.

It your TMX file includes any other language, they will be maintained
without a change.  This behavior can change in the future, as a basic
regexp based POStaggerr might be implemented.

=head2 Options

-ner   -- groups Proper names with tag C<mwe>

   which     which    WDT
   includes  include  VBZ
   <mwe lema="edward_cole" pos="NP00000">
   Edward          
   Cole
   </mwe>

-compact

-dates

=head1 SEE ALSO

XML::TMX, Lingua::FreeLing3

=head1 AUTHORS

=over 4

=item *

Alberto Simões <ambs@cpan.org>

=item *

José João Almeida <jj@di.uminho.pt>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut