The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Treex::Block::W2A::TokenizeOnWhitespace;
$Treex::Block::W2A::TokenizeOnWhitespace::VERSION = '2.20151102';
use strict;
use warnings;
use Moose;
use utf8;
use Treex::Core::Common;
extends 'Treex::Core::Block';

sub tokenize_sentence {
    my ( $self, $sentence ) = @_;
    return $sentence;
}

sub process_zone {
    my ( $self, $zone ) = @_;

    # create a-tree
    my $a_root = $zone->create_atree();

    # get the source sentence and tokenize
    my $sentence = $zone->sentence;
    $sentence =~ s/^\s+//;
    log_fatal("No sentence to tokenize!") if !defined $sentence;
    my @tokens = split( /\s/, $self->tokenize_sentence($sentence) );

    # Create a-nodes and detect the no_space_after attribute.
    foreach my $i ( ( 0 .. $#tokens ) ) {
        my $token = $tokens[$i];

        # By default, there is a space after each token.
        my $no_space_after = 0;

        # Delete the token from the begining of the sentence.
        if ($sentence =~ s/^\Q$token\E//){
            # This is the expected case. The sentence starts with the $token.
            # If it is followed by a space, delete the space and set no_space_after=1.
            $no_space_after = 1 if length($sentence) && $sentence !~ s/^\s+//;
        } elsif ($i < $#tokens) {
            # The token (returned from tokenization) does not match the start of sentence.
            # E.g. '. . . word' is tokenized as  '... word'.
            # Let's delete the start of sentence anyway,
            # using a non-greedy regex and the expected next token returned from the tokenization.
            my $next_token = $tokens[$i+1];
            my ($first, $rest) = ($sentence =~ /^(.*?)(\Q$next_token\E.*)$/);
            $no_space_after = 1 if $first !~ /\s$/;
            $sentence = $rest;
        }

        # create new a-node
        $a_root->create_child(
            form           => $token,
            no_space_after => $no_space_after,
            ord            => $i + 1,
        );
    }
    return 1;
}

1;

__END__

=encoding utf-8

=head1 NAME

Treex::Block::W2A::TokenizeOnWhitespace - Base tokenizer, splits on whitespaces, fills no_space_after

=head1 VERSION

version 2.20151102

=head1 DESCRIPTION

Each sentence is split into a sequence of tokens.
Flat a-tree is built and attributes C<no_space_after> are filled.
This class tokenizes only on whitespaces,
but it can be used as an ancestor for more apropriate tokenization
by overriding the method C<tokenize_sentence>.

=head1 METHODS

=over 4

=item tokenize_sentence()

this method can be overridden in more advanced tokenizers

=item process_zone()

this method does all work of this tokenizer

=back

=head1 AUTHOR

David Mareček <marecek@ufal.mff.cuni.cz>

Martin Popel <popel@ufal.mff.cuni.cz>

=head1 COPYRIGHT AND LICENSE

Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.