The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
# ABSTRACT: uses MinHash & SpeedyFx to compare large text data
# PODNAME: minhash_cmp
use autodie;
use strict;
use utf8;
use warnings;

our $VERSION = '0.009'; # VERSION


use Getopt::Long;
use List::MoreUtils qw(distinct);
use Pod::Usage;
use Text::SpeedyFx;

# Parse the command-line switches; each one lands in its own lexical.
# Unknown or malformed options print the brief usage text and exit.
GetOptions(
    q(help)             => \my $help,
    q(epsilon=f)        => \my $e,
    q(k=i)              => \my $k,
    q(seed=i)           => \my $seed,
    q(bits=i)           => \my $bits,
) or pod2usage(-verbose => 1);

# Show the full manual on --help, or when anything other than exactly two
# file arguments remain after option parsing.
pod2usage(-verbose => 2)
    if $help or @ARGV != 2;

# A missing (or zero) epsilon falls back to the documented default; zero is
# deliberately excluded because it would divide by zero below.
$e = 0.05
    unless $e;

# Unless given explicitly, derive the number of hash functions from the
# expected error: k = 1 / epsilon^2, rounded to the nearest integer.
# A large epsilon (e.g. 2) rounds to 0, which would leave no hash functions
# and make the final "matches / k" division fail, so use at least one.
unless ($k) {
    $k = 0 + sprintf q(%0.0f), 1 / ($e ** 2);
    $k = 1
        if $k < 1;
}

# An explicit negative --k cannot be honoured: reject it up front instead of
# silently producing an empty hash set and a meaningless similarity.
pod2usage(
    -message => q(--k must be a positive integer),
    -verbose => 1,
    -exitval => 2,
) if $k < 1;

# Test definedness rather than truth so that "--seed 0" is honoured.
$seed = 0x4c53_4820
    unless defined $seed;

# Zero bits is not a usable width, so treat it like "not given".
$bits = 8
    unless $bits;

# Seed Perl's PRNG first so that the per-function seeds drawn below are
# reproducible for a given --seed value.
srand $seed;

# Build $k Text::SpeedyFx instances, each constructed with its own
# pseudo-random 32-bit seed and the requested character width.
my $hashes = [
    map { Text::SpeedyFx->new(int(rand(2 ** 32)), $bits) } 1 .. $k
];

# Slurp every file named on the command line, in order, through the :mmap
# PerlIO layer. The resulting list holds one scalar per input file.
my @text;
for my $path (@ARGV) {
    open my $input, q(<:mmap), $path;

    # Undefine the record separator only for this read so that the
    # whole file comes back as a single string.
    my $content = do { local $/ = undef; <$input> };
    close $input;

    push @text, $content;
}

# Count the hash functions whose minimum hash value is the same for every
# input text: distinct() in scalar context yields the number of unique
# minimums, so exactly 1 means all texts agree under that function.
my $match = grep {
    my $hash = $_;
    1 == distinct map { $hash->hash_min($_); } @text;
} @{$hashes};

# The similarity estimate is the fraction of agreeing hash functions.
my $similarity = $match / $k;
printf qq(k=%d; similarity=%0.5f\n), $k, $similarity;

__END__

=pod

=encoding utf8

=head1 NAME

minhash_cmp - uses MinHash & SpeedyFx to compare large text data

=head1 VERSION

version 0.009

=head1 SYNOPSIS

    minhash_cmp [options] FILE1 FILE2

=head1 DESCRIPTION

MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.

=head1 OPTIONS

=over 4

=item --help

Show this documentation and exit.

=item --epsilon

Expected error value used to compute the number of different hash functions (default: 0.05).

=item --k

Number of different hash functions to use (default: 400; overrides C<--epsilon>).

=item --seed

Custom seed (integer).

=item --bits

How many bits to use to represent one character.
The default value, 8, sacrifices Unicode handling but is fast and low on memory footprint.
The value of 18 encompasses I<Basic Multilingual>, I<Supplementary Multilingual> and I<Supplementary Ideographic> planes.

=back

=head1 CAVEATS

Under C<bits=18> setting, each initialized hash function consumes ~500KB.

=head1 SEE ALSO

=over 4

=item *

L<MinHash|http://en.wikipedia.org/wiki/MinHash>

=item *

L<Text::SpeedyFx>

=back

=head1 AUTHOR

Stanislaw Pusep <stas@sysd.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by Stanislaw Pusep.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut