#!/usr/bin/env perl
use autodie;
use strict;
use utf8;
use warnings;
use Benchmark qw(cmpthese :hireswallclock);
use Digest::MurmurHash qw(murmur_hash);
use Text::SpeedyFx;
# Load the entire benchmark corpus (the file "enwik8" in the current
# directory) into a single scalar. The handle is opened through the
# :mmap PerlIO layer; autodie (enabled above) turns a failed open into
# an exception, so no explicit error check is needed here.
my $data = do {
    open my $fh, q(<:mmap), q(enwik8);

    # Undefine the input record separator for this block only, so the
    # single readline below returns the whole file instead of one line.
    local $/;
    <$fh>;
};
# Two Text::SpeedyFx instances, both constructed with 1 as the first
# argument. The extra second argument (8) on $sfx_latin1 presumably
# selects an 8-bit/Latin-1 character table, as the variable name
# suggests -- confirm against the Text::SpeedyFx documentation.
my $sfx        = Text::SpeedyFx->new(1);
my $sfx_latin1 = Text::SpeedyFx->new(1, 8);

# Benchmark label => code to time. Every candidate processes the same
# $data corpus; the "murmur_utf8" entry is the pure-Perl regex tokenizer
# (tokenize) used as the comparison point for the SpeedyFx methods.
my %benchmark_for = (
    hash          => sub { $sfx_latin1->hash($data) },
    hash_utf8     => sub { $sfx->hash($data) },
    hash_fv       => sub { $sfx_latin1->hash_fv($data, 1024 << 3) },
    hash_min      => sub { $sfx_latin1->hash_min($data) },
    hash_min_utf8 => sub { $sfx->hash_min($data) },
    murmur_utf8   => sub { tokenize($data) },
);

# Run each candidate 10 times and print the comparison chart.
cmpthese(10, \%benchmark_for);
# tokenize($data)
#
# Reference tokenizer for the benchmark: scans $data for runs of word
# characters (\w+), lowercases each run, hashes it with murmur_hash and
# counts how many times each hash value occurs.
#
# Parameters:
#   $data - the text to tokenize.
#
# Returns:
#   A hash reference mapping murmur_hash(lc $word) => occurrence count.
#   Always a hash reference: input containing no word characters yields
#   an empty hash rather than undef, so callers can dereference the
#   result unconditionally.
sub tokenize {
    my ($data) = @_;

    # Start with a real (empty) hash instead of relying on
    # autovivification, which would leave this undef when nothing matches.
    my $fv = {};

    # //g in scalar context resumes after the previous match each time
    # round the loop, so every word is visited exactly once.
    while ($data =~ /(\w+)/g) {
        ++$fv->{ murmur_hash(lc $1) };
    }

    return $fv;
}