The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#-*-perl-*-

package Uplug::PreProcess::Tokenizer;

=head1 NAME

Uplug::PreProcess::Tokenizer

=head1 SYNOPSIS

 my $tokenizer = new Uplug::PreProcess::Tokenizer( lang => 'en' );
 my @tokens = tokenizer->tokenize( 'Mr. Smith says: "What is a text anyway?"' );
 my $text = detokenize( '" Big improvement ! " says Mr. Smith .');

=head1 IMPLEMENTS

=head2 C<tokenize>

Tokenize a given text. Returns a list of tokens.

=head2 C<detokenize>

De-tokenize a space-separated text or a list of tokens. Returns plain text.

=head2 C<load_prefixes>

Load language specific abbreviations and other non-breaking prefixes.

=head1 DESCRIPTION

This module heavily relies on the implementation of the tokenizer and detokenizer used in the Moses toolkit for SMT. All credits go to the original authors (Josh Schroeder and Philipp Koehn).

=cut

use strict;


use Uplug::Config;

# defaults: language = English

our $DEFAULT_LANG = 'en';

our %NONBREAKING_PREFIX;
our $NONBREAKING_PREFIX_DIR     = &shared_lang() . '/nonbreaking_prefixes';
our $DEFAULT_NONBREAKING_PREFIX = $NONBREAKING_PREFIX_DIR . '/nonbreaking_prefix.en';


sub new {
    my $class = shift;
    my %self  = @_;

    if ( $self{lang} ) {
        load_prefixes( $self{lang}, \%NONBREAKING_PREFIX );
    }

    return bless \%self, $class;
}


sub tokenize {
    my $self = shift;

    my ($text) = @_;
    chomp($text);
    $text = " $text ";

    # seperate out all "other" special characters
    $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; #`

    #multi-dots stay together
    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
    while ( $text =~ /DOTMULTI\./ ) {
        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
    }

    # seperate out "," except if within numbers (5,300)
    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;

    # separate , pre and post number
    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    # turn `into '
    $text =~ s/\`/\'/g; #`

    #turn '' into "
    $text =~ s/\'\'/ \" /g;

    if ( $$self{lang} eq "en" ) {
        #split contractions right
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;

        #special case for "1990's"
        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
    }
    elsif ( ( $$self{lang} eq "fr" ) or ( $$self{lang} eq "it" ) ) {
        #split contractions left
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
    }
    else {
        $text =~ s/\'/ \' /g;
    }

    #word token method
    my @words = split( /\s/, $text );
    $text = "";
    for ( my $i = 0; $i < ( scalar(@words) ); $i++ ) {
        my $word = $words[$i];
        if ( $word =~ /^(\S+)\.$/ ) {
            my $pre = $1;
            if (( $pre =~ /\./ && $pre =~ /\p{IsAlpha}/ )
                || (   $NONBREAKING_PREFIX{$pre}
                    && $NONBREAKING_PREFIX{$pre} == 1 )
                || ( $i < scalar(@words) - 1
                    && ( $words[ $i + 1 ] =~ /^[\p{IsLower}]/ ) ) )
            {
                #no change
            }
            elsif (
                (      $NONBREAKING_PREFIX{$pre}
                    && $NONBREAKING_PREFIX{$pre} == 2
                )
                && ( $i < scalar(@words) - 1
                    && ( $words[ $i + 1 ] =~ /^[0-9]+/ ) ) )
            {
                #no change
            }
            else {
                $word = $pre . " .";
            }
        }
        $text .= $word . " ";
    }

    # clean up extraneous spaces
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #restore multi-dots
    while ( $text =~ /DOTDOTMULTI/ ) {
        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
    }
    $text =~ s/DOTMULTI/./g;

    #ensure final line break
    $text .= "\n" unless $text =~ /\n$/;
    return split( /\s+/, $text );
}


sub load_prefixes {
    my ( $language, $PREFIX_REF ) = @_;

    my $prefixfile
        = $NONBREAKING_PREFIX_DIR . '/nonbreaking_prefix.' . $language;

    #default back to English if we don't have a language-specific prefix file
    if ( !( -e $prefixfile ) ) {
        $prefixfile = $DEFAULT_NONBREAKING_PREFIX;
        unless ( -e $prefixfile ) {
            return 0;
        }
    }

    if ( -e "$prefixfile" ) {
        open( PREFIX, "<:encoding(utf8)", "$prefixfile" );
        while (<PREFIX>) {
            my $item = $_;
            chomp($item);
            if ( ($item) && ( substr( $item, 0, 1 ) ne "#" ) ) {
                if ( $item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/ ) {
                    $PREFIX_REF->{$1} = 2;
                }
                else {
                    $PREFIX_REF->{$item} = 1;
                }
            }
        }
        close(PREFIX);
    }
}


# this is the Moses detokenizer
# written by Josh Schroeder, based on code by Philipp Koehn

sub detokenize {
    my $self  = shift;
    my $token = shift;

    my $text = ref($token) eq 'ARRAY' ? join( ' ', @$token ) : $token;
    my $language = $self->{lang} || 'en';

    #    chomp($text);
    $text =~ s/\n/ /gs;
    $text =~ s/\s\s+/ /gs;
    $text = " $text ";

    my $word;
    my $i;
    my @words = split( / /, $text );
    $text = "";
    my %quoteCount = ( "\'" => 0, "\"" => 0 );
    my $prependSpace = " ";
    for ( $i = 0; $i < ( scalar(@words) ); $i++ ) {
        if ( $words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/ ) {
            # perform right shift on currency and other random punctuation items
            $text         = $text . $prependSpace . $words[$i];
            $prependSpace = "";
        }
        elsif ( $words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/ ) {
            # perform left shift on punctuation items
            $text         = $text . $words[$i];
            $prependSpace = " ";
        }
        elsif (( $language eq "en" )
            && ( $i > 0 )
            && ( $words[$i] =~ /^[\'][\p{IsAlpha}]/ )
            && ( $words[ $i - 1 ] =~ /[\p{IsAlnum}]$/ ) )
        {
            # left-shift the contraction for English
            $text         = $text . $words[$i];
            $prependSpace = " ";
        }
        elsif (( $language eq "fr" )
            && ( $i < ( scalar(@words) - 2 ) )
            && ( $words[$i] =~ /[\p{IsAlpha}][\']$/ )
            && ( $words[ $i + 1 ] =~ /^[\p{IsAlpha}]/ ) )
        {
            # right-shift the contraction for French
            $text         = $text . $prependSpace . $words[$i];
            $prependSpace = "";
        }
        elsif ( $words[$i] =~ /^[\'\"]+$/ ) {
            # combine punctuation smartly
            if ( ( $quoteCount{ $words[$i] } % 2 ) eq 0 ) {
                if (   ( $language eq "en" )
                    && ( $words[$i] eq "'" )
                    && ( $i > 0 )
                    && ( $words[ $i - 1 ] =~ /[s]$/ ) )
                {
                   # single quote for posesssives ending in s... "The Jones' house"
                   # left shift
                    $text         = $text . $words[$i];
                    $prependSpace = " ";
                }
                else {
                    # right shift
                    $text         = $text . $prependSpace . $words[$i];
                    $prependSpace = "";
                    $quoteCount{ $words[$i] } = $quoteCount{ $words[$i] } + 1;
                }
            }
            else {
                # left shift
                $text                     = $text . $words[$i];
                $prependSpace             = " ";
                $quoteCount{ $words[$i] } = $quoteCount{ $words[$i] } + 1;
            }
        }
        else {
            $text         = $text . $prependSpace . $words[$i];
            $prependSpace = " ";
        }
    }

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # add trailing break
    #$text .= "\n" unless $text =~ /\n$/;

    return $text;
}


1;