The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Tokenizer block implemented as a Moose class derived from Treex::Core::Block.
package Treex::Block::W2A::TokenizeOnWhitespace;
BEGIN {
  # Assign the package version inside BEGIN so it is set at compile time.
  $Treex::Block::W2A::TokenizeOnWhitespace::VERSION = '0.08170';
}
use Moose;
use Treex::Core::Common;    # presumably the source of log_fatal() used in process_zone -- confirm
extends 'Treex::Core::Block';

# Tokenization hook meant to be overridden by subclasses.
#
# Takes the sentence string and returns the string that process_zone() will
# split on whitespace to obtain the tokens. This base implementation hands
# the sentence back unchanged.
sub tokenize_sentence {
    my $self     = shift;
    my $sentence = shift;
    return $sentence;
}

# Build a flat a-tree for one zone from the whitespace-separated tokens of
# its sentence.
#
# The zone's sentence is passed through tokenize_sentence() (which subclasses
# may override), the result is split on whitespace, and one child of the
# a-tree root is created per token with these attributes:
#   form           - the token text
#   no_space_after - 0 if the token is followed by whitespace in the
#                    sentence, 1 otherwise
#   ord            - 1-based position of the token
#
# Calls log_fatal() when the zone has no sentence. Returns 1.
sub process_zone {
    my ( $self, $zone ) = @_;

    # create a-tree
    my $a_root = $zone->create_atree();

    # Get the source sentence. The definedness check has to come before any
    # regex is applied to it; otherwise an undefined sentence produces an
    # "uninitialized value" warning ahead of the fatal error.
    my $sentence = $zone->sentence;
    log_fatal("No sentence to tokenize!") if !defined $sentence;
    $sentence =~ s/^\s+//;

    # split ' ' ignores leading whitespace and treats every run of whitespace
    # as a single separator, so consecutive spaces never yield empty tokens
    # (which would otherwise become a-nodes with an empty form).
    my @tokens = split( ' ', $self->tokenize_sentence($sentence) );

    foreach my $i ( 0 .. $#tokens ) {
        my $token = $tokens[$i];

        # delete the token from the beginning of the sentence
        $sentence =~ s/^\Q$token\E//;

        # if no whitespace follows the token, no_space_after is set to 1
        my $no_space_after = $sentence =~ /^\s/ ? 0 : 1;

        # delete the whitespace separating this token from the next one
        $sentence =~ s/^\s+//;

        # create new a-node
        $a_root->create_child(
            form           => $token,
            no_space_after => $no_space_after,
            ord            => $i + 1,
        );
    }
    return 1;
}

1;

__END__

=encoding utf-8

=head1 NAME

Treex::Block::W2A::TokenizeOnWhitespace - Base tokenizer, splits on whitespaces, fills no_space_after

=head1 VERSION

version 0.08170

=head1 DESCRIPTION

Each sentence is split into a sequence of tokens.
A flat a-tree is built and the C<no_space_after> attributes are filled.
This class tokenizes only on whitespace,
but it can be used as an ancestor for more appropriate tokenization
by overriding the method C<tokenize_sentence>.

=head1 METHODS

=over 4

=item tokenize_sentence()

this method can be overridden in more advanced tokenizers

=item process_zone()

this method does all the work of this tokenizer

=back

=head1 AUTHOR

David Mareček <marecek@ufal.mff.cuni.cz>

Martin Popel <popel@ufal.mff.cuni.cz>

=head1 COPYRIGHT AND LICENSE

Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.