The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
use strict;
use Test::More tests => 25;
use Search::Tools::Tokenizer;
use Search::Tools::UTF8;
use Search::Tools::Snipper;
use Data::Dump qw( dump );

# http://code.google.com/p/test-more/issues/detail?id=46
binmode Test::More->builder->output,         ":utf8";
binmode Test::More->builder->failure_output, ":utf8";

# simple case
ok( my $tokenizer = Search::Tools::Tokenizer->new(), "new tokenizer" );
ok( my $tokens = $tokenizer->tokenize( 'I am a sentence.', qr/\w/ ),
    "tokenize" );
ok( $tokens->get_token(0)->is_sentence_start, "first token starts sentence" );
ok( $tokens->get_token( $tokens->num - 1 )->is_sentence_end,
    "last token ends sentence" );

#dump( $tokens->get_sentence_starts );

ok( $tokens
        = $tokenizer->tokenize( "r-o-c-k in the U.S.A. Mr. Smith!", qr/\w/ ),
    "parse abbrev"
);
ok( $tokens->get_token(11)->is_sentence_end, "smith is sentence ender" );

#dump( $tokens->get_sentence_starts );
#$tokens->dump;

# harder
ok( $tokens
        = $tokenizer->tokenize( qq/lo! how a rose 'ere bloometh/, qr/\w/ ),
    "tokenize rose"
);
ok( $tokens->get_token(0)->is_sentence_start,
    "first token starts sentence even though lowercase"
);
ok( $tokens->get_token(1)->is_sentence_end, "second token is sentence end" );

#dump( $tokens->get_sentence_starts );

# utf8 w/ punc start
ok( $tokens = $tokenizer->tokenize(
        to_utf8("¿Cómo estás? is Èste! ¿is super Èste? "), qr/\w/
    ),
    "tokenize spanish"
);
ok( $tokens->get_token(0)->is_sentence_start, "spanish ¿ starts sentence" );
TODO: {
    local $TODO = 'C is hard.';
    ok( $tokens->get_token(8)->is_sentence_start,
        "spanish ¿ starts sentence in middle of the string"
    );
}
ok( $tokens->get_token( $tokens->len - 1 )->is_sentence_end,
    "punctuation ends sentence" );

#dump( $tokens->get_sentence_starts );
#$tokens->dump;

ok( my $snipper = Search::Tools::Snipper->new(
        query        => 'foo',
        as_sentences => 1,
        max_chars    => 5,
    ),
    "new snipper"
);

ok( my $snip = $snipper->snip('Text with match near the foo'), "snip foo" );
is( $snip, 'Text ... ', "got snip" );

my $long_text = <<EOF;
This is a long section of text with foo. First, there is an intro sentence.
Second, there is an explanation about foo. Third, and finally for foo, 
there is a conclusion that ties it all together.
EOF

my $long_text_snip
    = qq/This is a long section of text with foo. ... Second, there is an explanation about foo. ... Third, and finally for foo, there is a conclusion that ties it all together./;

my $long_snipper = Search::Tools->snipper(
    query => 'foo',

    #debug         => 1,
    occur         => 3,      # number of snips
    context       => 100,    # number of words in each snip
    as_sentences  => 1,
    ignore_length => 1,      # ignore max_chars, return entire snippet.
    show          => 0,      # only show if match, no dumb substr
    treat_phrases_as_singles => 0,    # keep phrases together
);

ok( my $long_snip = $long_snipper->snip($long_text), "snip long text" );
is( $long_snipper->snip('foo'), 'foo', "easy optimization for single term" );
is( $long_snipper->snip('foo bar baz foo'),
    'foo bar baz foo',
    "slightly harder optimization"
);

#diag($long_snip);
is( $long_snip, $long_text_snip, "long text snip" );

#########
# straight up sentence detection
ok( my $sent_tokens = $tokenizer->tokenize($long_text),
    "tokenize long text" );
my $nstarts = 0;
while ( my $t = $sent_tokens->next ) {

    if ( $t->is_sentence_start ) {

        #print "\nSTART: ";
        $nstarts++;
    }

    #print "$t";

}
is( $nstarts, 4, "4 sentences detected" );

# as_sentences
ok( my $sentences = $sent_tokens->as_sentences, "as_sentences" );
for my $sentence (@$sentences) {

    #    diag( join( "", map {"$_"} @$sentence ) );
}
ok( $sentences = $sent_tokens->as_sentences(1), "as_sentences stringed" );
for my $sentence (@$sentences) {

    #    diag($sentence);
}

#dump($sentences);
is_deeply(
    $sentences,
    [   "This is a long section of text with foo.",
        "First, there is an intro sentence.",
        "Second, there is an explanation about foo.",
        "Third, and finally for foo, there is a conclusion that ties it all together.",
    ],
    "sentence strings"
);