#!/usr/bin/perl
use lib qw(../lib);
use strict;
use warnings;
$|++;
use Lingua::Stem;
use Lingua::Stem::En;
use Lingua::Stem::Snowball;
use Benchmark::Timer;
my $timer = Benchmark::Timer->new;
die "Usage: ./benchmark_stemmers.plx TEXTFILE"
unless @ARGV;
# retrieve, pre-process and tokenize text
my $text = do {
open( my $fh, '<', $ARGV[0] )
or die "Couldn't open file '$ARGV[0]' for reading: $!";
local $/;
<$fh>;
};
$text = lc($text);
$text =~ s/[^a-z']/ /gs;
$text =~ s/\B'//g;
$text =~ s/'\B//g;
my @tokens = split( ' ', $text );
for my $iter ( 1 .. 10 ) {
print "$iter ";
my ( @out, $out );
my $snowball = Lingua::Stem::Snowball->new( lang => 'en' );
my $lingua_stem = Lingua::Stem->new( -locale => 'EN' );
# LSS
$timer->start('LSS');
@out = $snowball->stem(\@tokens);
$timer->stop('LSS');
undef @out;
# stem_in_place, if this version of LSS is recent enough
if ( $snowball->can('stem_in_place') ) {
my @copy = @tokens;
$timer->start('LSS2');
$snowball->stem_in_place(\@copy);
$timer->stop('LSS2');
}
# LS
$timer->start('LS');
$out = $lingua_stem->stem(@tokens);
$timer->stop('LS');
undef $out;
# LS, with stem caching
$lingua_stem->stem_caching({ -level => 2 });
$timer->start('LS2');
$out = $lingua_stem->stem(@tokens);
$timer->stop('LS2');
undef $out;
# LS, with stem caching and stem in place
{
$lingua_stem->stem_caching({ -level => 2 });
my @copy = @tokens;
my $copy_ref = \@copy;
$timer->start('LS2SIP');
$out = $lingua_stem->stem_in_place(@copy);
$timer->stop('LS2SIP');
undef $out;
}
}
# prepare vars used in the report
my $num_tokens = scalar @tokens;
my %unique;
$unique{$_} = 1 for @tokens;
my $num_unique = scalar keys %unique;
my $ls_ver = $Lingua::Stem::VERSION;
my $lss_ver = $Lingua::Stem::Snowball::VERSION;
$lss_ver =~ s/_.*//;
my %results = $timer->results;
# print the report
printf('
|--------------------------------------------------------------------|
| source: %-19s | words: %-6d | unique words: %-6d |
|--------------------------------------------------------------------|
| module | config | avg secs | rate |
|--------------------------------------------------------------------|',
$ARGV[0], $num_tokens, $num_unique );
printf('
| Lingua::Stem %.2f | no cache | %.3f | %-7d |
| Lingua::Stem %.2f | cache level 2 | %.3f | %-7d |
| Lingua::Stem %.2f | cachelv2, sip | %.3f | %-7d |
| Lingua::Stem::Snowball %.2f | stem | %.3f | %-7d |',
$ls_ver, $results{LS}, ($num_tokens/$results{LS}),
$ls_ver, $results{LS2}, ($num_tokens/$results{LS2}),
$ls_ver, $results{LS2SIP}, ($num_tokens/$results{LS2SIP}),
$lss_ver, $results{LSS}, ($num_tokens/$results{LSS}),
);
printf('
| Lingua::Stem::Snowball %-4s | stem_in_place | %.3f | %-7d |',
$lss_ver, $results{LSS2}, ($num_tokens/$results{LSS2}) )
if exists $results{LSS2};
print "\n|" . ('-' x 68) . "|\n";