The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Search::Tools::TokenListUtils;
use Moo::Role;
use Carp;

our $VERSION = '1.007';

=head1 NAME

Search::Tools::TokenListUtils - mixin methods for TokenList and TokenListPP

=head1 SYNOPSIS

 my $tokens = $tokenizer->tokenize( $string );
 if ( $tokens->str eq $string) {
    print "string is same, before and after tokenize()\n";
 }
 else {
    warn "I'm filing a bug report against Search::Tools right away!\n";
 }
 
 my ($start_pos, $end_pos) = $tokens->get_window( 5, 20 );
 # $start_pos probably == 0
 # $end_pos probably   == 25
 
 my $slice = $tokens->get_window_pos( 5, 20 );
 for my $token (@$slice) {
    print "token = $token\n";
 }

=head1 DESCRIPTION

Search::Tools::TokenListUtils contains pure-Perl methods inhertited
by both Search::Tools::TokenList and Search::Tools::TokenListPP.

=head1 METHODS

=head2 str

Returns a serialized version of the TokenList. If you haven't
altered the TokenList since you got it from tokenize(),
then str() returns a scalar string identical to (but not the same as)
the string you passed to tokenize().

Both Search::Tools::TokenList and TokenListPP are overloaded
to stringify to the str() value.

=cut

sub str {
    my $self   = shift;
    my $joiner = shift(@_);
    if ( !defined $joiner ) {
        $joiner = '';
    }
    return join( $joiner, map {"$_"} @{ $self->as_array } );
}

=head2 get_window( I<pos> [, I<size>, I<as_sentence>] )

Returns array with two values: I<start> and I<end> positions
for the array of length I<size> on either side of I<pos>. 
Like taking a slice of the TokenList.

Note that I<size> is the number of B<tokens> not B<matches>.
So if you're looking for the number of "words", think about
I<size>*2.

Note too that I<size> is the number of B<tokens> on B<one>
side of I<pos>. So the entire window width (length of the returned
slice) is I<size>*2 +/-1. The window is guaranteed to be bounded
by B<matches>.

If I<as_sentence> is true, the window is shifted to try and match
the first token prior to I<pos> that returns true for is_sentence_start().

=cut

sub get_window {
    my $self = shift;
    my $pos  = shift;
    if ( !defined $pos ) {
        croak "pos required";
    }

    my $size        = int(shift) || 20;
    my $as_sentence = shift      || 0;
    my $max_index   = $self->len - 1;

    if ( $pos > $max_index or $pos < 0 ) {
        croak "illegal pos value: no such index in TokenList";
    }

    #warn "window size $size for pos $pos";

    # get the $size tokens on either side of $tok
    my ( $start, $end );

    # is token too close to the top of the stack?
    if ( $pos > $size ) {
        $start = $pos - $size;
    }

    # is token too close to the bottom of the stack?
    if ( $pos < ( $max_index - $size ) ) {
        $end = $pos + $size;
    }
    $start ||= 0;
    $end   ||= $max_index;

    if ($as_sentence) {
        my $sentence_starts = $self->get_sentence_starts;

        # default to what we have.
        my $start_for_pos = $start;
        my $i             = 0;

        #warn "looking for sentence_start for start = $start end = $end\n";
        for (@$sentence_starts) {

            #warn " $_ [$i]\n";
            if ( $_ >= $pos ) {
                $start_for_pos = $sentence_starts->[$i];
                last;
            }
            $i++;
        }

        #warn "found $start_for_pos (start = $start end = $end)\n";
        if ( $start_for_pos != $start ) {
            if ( $start_for_pos < $start ) {
                $end -= ( $start - $start_for_pos );
            }
            else {
                $end += ( $start_for_pos - $start );
            }
            $start = $start_for_pos;
        }

        #warn "now $start_for_pos (start = $start end = $end)\n";
    }
    else {

        # make sure window starts and ends with is_match
        while ( !$self->get_token($start)->is_match ) {
            $start++;
        }
        while ( !$self->get_token($end)->is_match ) {
            $end--;
        }
    }

    #warn "return $start .. $end";
    #warn "$size ~~ " . ( $end - $start );

    return ( $start, $end );
}

=head2 get_window_tokens( I<pos> [, I<size>] )

Like get_window() but returns an array ref of a slice
of the TokenList containing Tokens.

=cut

sub get_window_tokens {
    my $self = shift;
    my ( $start, $end ) = $self->get_window(@_);
    my @slice = ();
    for ( $start .. $end ) {
        push( @slice, $self->get_token($_) );
    }
    return \@slice;
}

=head2 as_sentences([I<stringified>])

Returns a reference to an array of arrays,
where each child array is a "sentence" worth of Token objects.
You can stringify each sentence array like:

 my $sentences = $tokenlist->as_sentences;
 for my $s (@$sentences) {
     printf("sentence: %s\n", join("", map {"$_"} @$s));
 }

If you pass a single true value to as_sentences(),
then the array returned will consist of plain scalar strings
with whitespace normalized.

=cut

sub as_sentences {
    my $self = shift;
    my $stringed = shift || 0;
    my @sents;
    my @s;

    # use array method since we do not know the iterator position
    for my $t ( @{ $self->as_array } ) {
        if ( $t->is_sentence_start ) {

            # if has any, add anonymous copy to master
            if (@s) {
                push @sents, [@s];
            }

            # reset
            @s = ();
        }

        # add
        push @s, $t;
    }
    if (@s) {
        push @sents, [@s];
    }
    if ($stringed) {
        my @stringed;
        for my $s (@sents) {
            my $str = join( "", map {"$_"} @$s );
            $str =~ s/\s\s+/\ /g;
            $str =~ s/\s+$//;
            push @stringed, $str;
        }
        return \@stringed;
    }

    return \@sents;
}

1;

__END__

=head1 AUTHOR

Peter Karman C<< <karman@cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-search-tools at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Search-Tools>.  
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Search::Tools


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Search-Tools>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Search-Tools>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Search-Tools>

=item * Search CPAN

L<http://search.cpan.org/dist/Search-Tools/>

=back

=head1 COPYRIGHT

Copyright 2009 by Peter Karman.

This package is free software; you can redistribute it and/or modify it under the 
same terms as Perl itself.