File: tmx-POStagger

package info (click to toggle)
libxml-tmx-perl 0.39-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 304 kB
sloc: perl: 1,712; xml: 115; makefile: 2
file content (227 lines) | stat: -rw-r--r-- 5,518 bytes
parent folder | download | duplicates (2)
#!/usr/bin/perl -s

# PODNAME: tmx-POStagger
# ABSTRACT: POStaggers translation units on a tmx file.

use strict;
use warnings;
use File::Temp qw/ :POSIX /;

my $tmpName = tmpnam();

our (
     $o,       # output filename
     $mwe,     # tag mwe  -- STILL NOT SUPPORTED
     $dates,   # do dates analysis...
     $ner,     # try to do NER....
     $compact, # compact form
     $s,       # use <s> tags (on by default)
    );

$s = 1 unless defined $s;

eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();

eval { require Lingua::FreeLing3::Word };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
Lingua::FreeLing3::Word->import();

my %initted = ();

use XML::TMX::Reader '0.25';

my $file = shift or die "You must supply the name of the file to tokenize";

my $reader = XML::TMX::Reader->new($file);

my $output = "pos_$file";
$output = $o if $o;

my $pos_languages = {};

binmode STDOUT, ":utf8";
$reader->for_tu(
  {
   -output => $tmpName,
   -verbose => 1,
   -verbatim => 1,
  },
  sub {
      my $tu = shift;
      for my $lang (keys %$tu) {
          if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) {
              $pos_languages->{$1} = 1;

              my $ln = lc $1;
              my $seg = $compact ? "" : "<![CDATA[";

              _init_ma($ln) unless exists $initted{$ln};

              my $txt = $tu->{$lang}{-seg};
              my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt;
              my $sentences = splitter($ln)->split(\@tokens);
              $sentences = morph($ln)->analyze($sentences);
              $sentences = hmm($ln)->tag($sentences);
              for my $stc (@$sentences) {
                  $seg .= "<s>\n" if $s;
                  $seg .= $compact ? _dump_compact($stc->words)
                                   : _dump_words($stc->words);
                  $seg .= "</s>\n" if $s;
              }
              $tu->{$lang}{-iscdata} = $compact ? 0 : 1;
              $seg .= "]]>" unless $compact;
              $tu->{$lang}{-seg} = $seg;
          }
      }
      return $tu;
   });

$reader = XML::TMX::Reader->new($tmpName);

$reader->for_tu({
	-output => $output,
	-verbatim => 1,
	-raw => 1,
	-prop => ($compact ? {} : {
                           'pos-tagged' => join(",",keys %$pos_languages),
                           'pos-fields' => "word,lemma,pos",
                           'pos-s-attributes' => ['s'],
                          })},
  sub { 
      $_ = $_[0];
      if (!/\[CDATA\[/) {
        s{(<s>)(.*?)(</s>)}
          {my ($before,$middle,$after) = ($1,$2,$3);
           for ($middle) {
              s/&/&amp;/g;s/</&lt;/g;s/>/&gt;/g;
           };
           $before.$middle.$after
          }ge;
      }
      return $_; });

#print STDERR "$tmpName\n";
unlink $tmpName if -e $tmpName;

sub _init_ma {
    my $lang = shift;

    morph($lang,
          QuantitiesDetection => 0,
          MultiwordsDetection => ($mwe // 0),
          NumbersDetection => 0,
          DatesDetection => ($dates // 0),
          NERecognition => ($ner // 0));

    $initted{$lang}++;
}

sub _dump_words {
    my @words = @_;
    my $seg = "";
    for my $t (@words) {
        if ($t->is_multiword) {
            $seg .= sprintf("<mwe lema=\"%s\" pos=\"%s\">\n", $t->lemma(), $t->tag());
            $seg .= _dump_words($t->get_mw_words);
            $seg .= "</mwe>\n";
        } else {
            $seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n";
        }
    }
    return $seg;
}

sub _dump_compact {
	my $seg = join(" " => map {
		my $t = $_;
		($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words)
		                  : ($t->lemma() || $t->lc_form()))
		. "/" . substr($t->tag(), 0, 1);
	} @_) . "\n";
	$seg =~ s/&/&amp;/g;
	$seg =~ s/</&lt;/g;
	$seg =~ s/>/&gt;/g;
	return $seg;
}

__END__

=pod

=encoding UTF-8

=head1 NAME

tmx-POStagger - POStaggers translation units on a tmx file.

=head1 VERSION

version 0.39

=head1 SYNOPSIS

   tmx-POStagger file.tmx      # creates pos_file.tmx

   tmx-POStagger -o=out.tmx file.tmx
        -ner    ... tags multiword named entities
        -dates
        -compact

=head1 DESCRIPTION

Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.

At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.

It your TMX file includes any other language, they will be maintained
without a change.  This behavior can change in the future, as a basic
regexp based POStaggerr might be implemented.

=head2 Options

-ner   -- groups Proper names with tag C<mwe>

   which     which    WDT
   includes  include  VBZ
   <mwe lema="edward_cole" pos="NP00000">
   Edward          
   Cole
   </mwe>

-compact

-dates

=head1 SEE ALSO

XML::TMX, Lingua::FreeLing3

=head1 AUTHORS

=over 4

=item *

Alberto Simões <ambs@cpan.org>

=item *

José João Almeida <jj@di.uminho.pt>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut