#!/usr/bin/perl -w

=head1 NAME

t/tokenizer.t - tests Plucene::Analysis::Tokenizer and its subclasses
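
=head1 DESCRIPTION

Builds a small temporary index, then checks that the base
Plucene::Analysis::Tokenizer class and its lower-case, character and
standard subclasses construct correctly, normalize text as expected,
and close cleanly.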

=cut

use strict;
use warnings;

use Plucene::Search::HitCollector;
use Plucene::Search::IndexSearcher;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::LowerCaseTokenizer;
use Plucene::Analysis::CharTokenizer;
use Plucene::Analysis::Standard::StandardTokenizer;
use Plucene::Analysis::Tokenizer;
use Plucene::Index::SegmentInfos;
use Plucene::Index::SegmentReader;

use Test::More tests => 13;
use File::Path;
use File::Temp qw/tempdir/;

use constant DIRECTORY => tempdir();

END { rmtree DIRECTORY }

#------------------------------------------------------------------------------
# Helper stuff
#------------------------------------------------------------------------------

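# Book fixtures as ordered (id => field-hash) pairs; the indexing loop
# below consumes them two at a time with splice().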
sub data {
	return [
		wsc => { name => "Writing Solid Code" },
		rap => { name => "Rapid Development" },
		gui => { name => "GUI Bloopers" },
		ora => { name => "Using Oracle 8i" },
		app => { name => "Advanced Perl Programming" },
		xpe => { name => "Extreme Programming Explained" },
		boo => { name => "Boo-Hoo" },
		dbs => { name => "Designing From Both Sides of the Screen" },
		dbi => { name => "Programming the Perl DBI" },
	];
}

#------------------------------------------------------------------------------
# Indexing
#------------------------------------------------------------------------------

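# Build a fresh index (the trailing 1 to Writer->new means "create"),
# giving each book a Keyword field for its id (indexed as-is and stored)
# and an UnStored field for its name (analyzed and indexed, not stored).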
sub index_documents {
	my @data   = @{ data() };
	my $writer =
		Plucene::Index::Writer->new(DIRECTORY,
		Plucene::Analysis::SimpleAnalyzer->new(), 1);
	while (my ($id, $terms) = splice @data, 0, 2) {
		my $doc = Plucene::Document->new;
		$doc->add(Plucene::Document::Field->Keyword(id => $id));
		$doc->add(Plucene::Document::Field->UnStored(%$terms));
		$writer->add_document($doc);
	}
	$writer->optimize();    # THIS IS NOT AN OPTIONAL STEP: it merges the
	                        # index into a single on-disk segment, which the
	                        # lone SegmentReader below relies on
}

index_documents();

#------------------------------------------------------------------------------
# Tests
#------------------------------------------------------------------------------

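# Read back the segments file of the index written above; optimize()
# left a single segment, whose reader is handed to each tokenizer below.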
my $sis = Plucene::Index::SegmentInfos->new;
$sis->read(DIRECTORY);

my @si     = $sis->segments;
my $reader = Plucene::Index::SegmentReader->new($si[0]);

{
	isa_ok my $tokenizer =
		Plucene::Analysis::Tokenizer->new({ reader => $reader }) =>
		'Plucene::Analysis::Tokenizer';
}

{    # normalizing with lowercase tokenizer
	isa_ok my $tokenizer =
		Plucene::Analysis::LowerCaseTokenizer->new({ reader => $reader }) =>
		'Plucene::Analysis::Tokenizer';
	isa_ok $tokenizer => 'Plucene::Analysis::LowerCaseTokenizer';
	my $norm = $tokenizer->normalize('SHOUT');
	is $norm => 'shout', "string normalized correctly (lowercase tokenizer)";
	ok $tokenizer->close, "closed lowercase tokenizer";
}

{    # normalizing with character tokenizer
	isa_ok my $tokenizer =
		Plucene::Analysis::CharTokenizer->new({ reader => $reader }) =>
		'Plucene::Analysis::Tokenizer';
	isa_ok $tokenizer => 'Plucene::Analysis::CharTokenizer';
	my $norm = $tokenizer->normalize('SHOUT');
	is $norm => 'SHOUT', "string normalized correctly (character tokenizer)";
	ok $tokenizer->close, "closed character tokenizer";
}

{    # normalize with standard tokenizer
	isa_ok my $tokenizer =
		Plucene::Analysis::Standard::StandardTokenizer->new(
		{ reader => $reader }) => 'Plucene::Analysis::Tokenizer';
	isa_ok $tokenizer => 'Plucene::Analysis::Standard::StandardTokenizer';
	my $norm = $tokenizer->normalize('SHOUT');
	is $norm => 'SHOUT', "string normalized correctly (standard tokenizer)";
	ok $tokenizer->close, "closed standard tokenizer";
}
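
# Beyond normalize(), a tokenizer is normally consumed as a token stream
# via next(). A hedged sketch (left as a comment so the test plan stays
# at 13 tests), assuming the Lucene-style next() interface and a
# string-backed filehandle as the reader:
#
#   use IO::Scalar;
#   my $fh        = IO::Scalar->new(\"Some Text To Split");
#   my $tokenizer =
#       Plucene::Analysis::LowerCaseTokenizer->new({ reader => $fh });
#   while (my $token = $tokenizer->next) {
#       print $token->text, "\n";    # prints: some / text / to / split
#   }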