The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Treex::Block::W2A::EN::TagLinguaEn;
$Treex::Block::W2A::EN::TagLinguaEn::VERSION = '0.13095';
use strict;
use warnings;
use Moose;
use Treex::Core::Common;
extends 'Treex::Core::Block';
use Lingua::EN::Tagger;
has _tagger => (
    is            => 'ro',
    isa           => 'Lingua::EN::Tagger',
    builder       => '_build_tagger',
    lazy          => 1,
    init_arg      => undef,
    predicate     => '_tagger_built',
    documentation => q{Tagger object},
);

has _form_corrections => (
    is      => 'ro',
    isa     => 'HashRef[Str]',
    default => sub {
        {
            q(``) => q("),
            q('') => q("),
        }
    },
    documentation => q{Possible changes in forms done by tagger},
);

sub _build_tagger {
    my $self   = shift;
    my $tagger = Lingua::EN::Tagger->new();
    return $tagger;
}

sub _revert_form {    #because Lingua::EN::Tagger changes some forms to another, we need to restore original
    my $self = shift;
    my %args = @_;
    my $new  = $args{new};
    return $self->_form_corrections->{$new} // $new;
}


# Substitutions according to http://search.cpan.org/src/ACOBURN/Lingua-EN-Tagger-0.13/README
# The PennTB tagset is described e.g. at http://www.computing.dcu.ie/~acahill/tagset.html
sub _correct_lingua_tag {
    my ( $self, $linguatag, $wordform ) = @_;
    $linguatag = uc $linguatag; # newer versions of Lingua::EN::Tagger use lowercased tags
    return 'DT' if $linguatag eq 'DET';
    return 'PRP$' if $linguatag eq 'PRPS';
    return "-$linguatag-" if $linguatag =~ /^[LR]RB$/;
    if ($linguatag =~ /^PP/) {       # allowed "tags" of punctuation mark in PennTB: #  $ '' ( ) , . : ``
        return $wordform if $wordform =~ /^(?:#|$|''|,|\.|:|``)$/;
        return '-LRB-' if $wordform =~ /[\(\[\{]/;
        return '-RRB-' if $wordform =~ /[\)\]\}]/;
        return '.';
    }
    return $linguatag;
}

sub process_zone {
    my ( $self, $zone ) = @_;

    # get the source sentence
    my $sentence = $zone->sentence;
    log_fatal("No sentence in zone") if !defined $sentence;
    log_fatal(qq{There's already atree in zone}) if $zone->has_atree();
    log_debug("Processing sentence: $sentence");

    #split on whitespace, tags nor tokens doesn't contain spaces
    my @tagged = split /\s+/, $self->_tagger->add_tags($sentence);

    # create a-tree
    my $a_root    = $zone->create_atree();
    my $tag_regex = qr{
        <(\w+)> #<tag>
        ([^<]+) #form
        </\1>   #</tag>
        }x;
    my $space_start = qr{^\s+};
    my $ord         = 1;
    foreach my $tag_pair (@tagged) {
        if ( $tag_pair =~ $tag_regex ) {
            my $form = $self->_revert_form( new => $2 );
            my $tag = $self->_correct_lingua_tag( $1, $form );
            if ( $sentence =~ s/^\Q$form\E// ) {

                #check if there is space after word
                my $no_space_after = $sentence =~ m/$space_start/ ? 0 : 1;
                if ( $sentence eq q{} ) {
                    $no_space_after = 0;
                }

                #delete it
                $sentence =~ s{$space_start}{};

                # and create node under root
                $a_root->create_child(
                    form           => $form,
                    tag            => $tag,
                    no_space_after => $no_space_after,
                    ord            => $ord++,
                );
            }
            else {
                log_fatal("Mismatch between tagged word and original sentence: Tagged: $form. Original: $sentence.");
            }
        }
        else {
            log_fatal("Incorrect output format from Lingua::EN::Tagger: $tag_pair");
        }

    }
    return 1;
}

1;

__END__

=pod

=encoding utf-8

=head1 NAME

Treex::Block::W2A::EN::TagLinguaEn

=head1 VERSION

version 0.13095

=head1 DESCRIPTION

Each node in analytical tree is tagged using C<Lingua::EN::Tagger> (Penn Treebank POS tags).
Because Lingua::EN::Tagger does its own tokenization, it checks if tokenization is same.

=head1 AUTHORS

Tomáš Kraut <kraut@ufal.mff.cuni.cz>

=head1 COPYRIGHT AND LICENSE

Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.