#!/usr/bin/perl -s
use strict;
use warnings;
use File::Temp qw/ :POSIX /;
my $tmpName = tmpnam();
our (
$o, # output filename
$mwe, # tag mwe -- STILL NOT SUPPORTED
$dates, # do dates analysis...
$ner, # try to do NER....
$compact, # compact form
$s, # use <s> tags (on by default)
);
$s = 1 unless defined $s;
eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();
eval { require Lingua::FreeLing3::Word };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
Lingua::FreeLing3::Word->import();
my %initted = ();
use XML::TMX::Reader '0.25';
my $file = shift or die "You must supply the name of the file to tokenize";
my $reader = XML::TMX::Reader->new($file);
my $output = "pos_$file";
$output = $o if $o;
my $pos_languages = {};
binmode STDOUT, ":utf8";
$reader->for_tu(
{
-output => $tmpName,
-verbose => 1,
-verbatim => 1,
},
sub {
my $tu = shift;
for my $lang (keys %$tu) {
if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) {
$pos_languages->{$1} = 1;
my $ln = lc $1;
my $seg = $compact ? "" : "<![CDATA[";
_init_ma($ln) unless exists $initted{$ln};
my $txt = $tu->{$lang}{-seg};
my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt;
my $sentences = splitter($ln)->split(\@tokens);
$sentences = morph($ln)->analyze($sentences);
$sentences = hmm($ln)->tag($sentences);
for my $stc (@$sentences) {
$seg .= "<s>\n" if $s;
$seg .= $compact ? _dump_compact($stc->words)
: _dump_words($stc->words);
$seg .= "</s>\n" if $s;
}
$tu->{$lang}{-iscdata} = $compact ? 0 : 1;
$seg .= "]]>" unless $compact;
$tu->{$lang}{-seg} = $seg;
}
}
return $tu;
});
$reader = XML::TMX::Reader->new($tmpName);
$reader->for_tu({
-output => $output,
-verbatim => 1,
-raw => 1,
-prop => ($compact ? {} : {
'pos-tagged' => join(",",keys %$pos_languages),
'pos-fields' => "word,lemma,pos",
'pos-s-attributes' => ['s'],
})},
sub {
$_ = $_[0];
if (!/\[CDATA\[/) {
s{(<s>)(.*?)(</s>)}
{my ($before,$middle,$after) = ($1,$2,$3);
for ($middle) {
s/&/&/g;s/</</g;s/>/>/g;
};
$before.$middle.$after
}ge;
}
return $_; });
#print STDERR "$tmpName\n";
unlink $tmpName if -e $tmpName;
sub _init_ma {
my $lang = shift;
morph($lang,
QuantitiesDetection => 0,
MultiwordsDetection => ($mwe // 0),
NumbersDetection => 0,
DatesDetection => ($dates // 0),
NERecognition => ($ner // 0));
$initted{$lang}++;
}
sub _dump_words {
my @words = @_;
my $seg = "";
for my $t (@words) {
if ($t->is_multiword) {
$seg .= sprintf("<mwe lema=\"%s\" pos=\"%s\">\n", $t->lemma(), $t->tag());
$seg .= _dump_words($t->get_mw_words);
$seg .= "</mwe>\n";
} else {
$seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n";
}
}
return $seg;
}
sub _dump_compact {
my $seg = join(" " => map {
my $t = $_;
($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words)
: ($t->lemma() || $t->lc_form()))
. "/" . substr($t->tag(), 0, 1);
} @_) . "\n";
$seg =~ s/&/&/g;
$seg =~ s/</</g;
$seg =~ s/>/>/g;
return $seg;
}
=encoding UTF-8
=head1 NAME
tmx-POStagger - POStaggers translation units on a tmx file.
=head1 SYNOPSIS
tmx-POStagger file.tmx # creates pos_file.tmx
tmx-POStagger -o=out.tmx file.tmx
-ner ... tags multiword named entities
-dates
-compact
=head1 DESCRIPTION
Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.
At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.
It your TMX file includes any other language, they will be maintained
without a change. This behavior can change in the future, as a basic
regexp based POStaggerr might be implemented.
=head2 Options
-ner -- groups Proper names with tag C<mwe>
which which WDT
includes include VBZ
<mwe lema="edward_cole" pos="NP00000">
Edward
Cole
</mwe>
-compact
-dates
=head1 SEE ALSO
XML::TMX, Lingua::FreeLing3
=head1 AUTHOR
Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2012-2014 by Alberto Manuel Brandão Simões
=cut