#!/usr/bin/env perl
# ABSTRACT: uses MinHash & SpeedyFx to compare large text data
# PODNAME: minhash_cmp
use autodie;
use strict;
use utf8;
use warnings;
our $VERSION = '0.009'; # VERSION
use Getopt::Long;
use List::MoreUtils qw(distinct);
use Pod::Usage;
use Text::SpeedyFx;
# Parse the command line. Every option is stored in a lexical declared right
# inside the GetOptions() call; if parsing fails, a usage message is shown
# and the script exits.
GetOptions(
    q(help)      => \my $help,
    q(epsilon=f) => \my $e,
    q(k=i)       => \my $k,
    q(seed=i)    => \my $seed,
    q(bits=i)    => \my $bits,
) or pod2usage(-verbose => 1);

# Exactly two file arguments are required; otherwise (or on --help) show the
# full manual.
pod2usage(-verbose => 2)
    if $help or $#ARGV != 1;

# Default error bound. The test is numeric on purpose: a zero spelled like
# "0.0" may arrive as a string, which Perl treats as true, and would then
# make the division below fail.
$e = 0.05
    unless defined $e and $e != 0;

# Unless given explicitly, derive the number of hash functions from the
# error bound as 1 / epsilon^2, rounded by sprintf to a whole number.
$k = 0 + sprintf q(%0.0f), 1 / ($e ** 2)
    unless $k;

# A large epsilon rounds k down to 0 and a negative --k is meaningless; both
# would otherwise end in a division by zero or a nonsensical similarity when
# the result is printed. Reject them here, before any file is read.
pod2usage(
    -message => q{Number of hash functions (--k) must be at least 1; use a smaller --epsilon or an explicit --k.},
    -verbose => 1,
    -exitval => 2,
) if $k < 1;

# Default seed: the ASCII bytes "LSH ". Only an absent --seed falls back to
# it, so an explicit --seed=0 is honoured instead of being silently replaced.
$seed = 0x4c53_4820
    unless defined $seed;

# Default character width in bits.
$bits = 8
    unless $bits;
# Seed Perl's random number generator first, so that the rand() calls below
# follow from the chosen seed.
srand $seed;

# Build $k Text::SpeedyFx objects, each constructed with its own random
# 32-bit value and the requested character width.
my $hashes = [
    map { Text::SpeedyFx->new(int rand 2 ** 32, $bits) } 1 .. $k
];
# Slurp every file named on the command line, in order, into @text.
# Files are opened through the :mmap PerlIO layer; autodie (enabled at the
# top of the file) turns any open/close failure into an exception.
my @text;
for my $path (@ARGV) {
    open my $fh, q(<:mmap), $path;

    # Undefine the input record separator only for this read, so the single
    # readline returns the whole file.
    my $data = do {
        local $/ = undef;
        <$fh>;
    };

    close $fh;
    push @text, $data;
}
# Count the hash functions for which all input texts yield the same minimum
# hash value: grep in scalar context returns the number of such functions.
my $match = grep {
    my $fx = $_;    # keep the hash object; the inner map re-aliases $_
    1 == distinct map { $fx->hash_min($_) } @text;
} @{$hashes};

# The similarity estimate is the fraction of hash functions that agreed.
my $similarity = $match / $k;
printf qq(k=%d; similarity=%0.5f\n), $k, $similarity;
__END__
=pod
=encoding utf8
=head1 NAME
minhash_cmp - uses MinHash & SpeedyFx to compare large text data
=head1 VERSION
version 0.009
=head1 SYNOPSIS
minhash_cmp [options] FILE1 FILE2
=head1 DESCRIPTION
MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.
=head1 OPTIONS
=over 4
=item --help
Print this documentation and exit.
=item --epsilon
Expected error value used to compute the number of different hash functions (default: 0.05).
=item --k
Number of different hash functions to use (default: 400; overrides C<--epsilon>).
=item --seed
Custom seed (integer).
=item --bits
How many bits to use to represent one character.
The default value, 8, sacrifices Unicode handling but is fast and low on memory footprint.
The value of 18 encompasses I<Basic Multilingual>, I<Supplementary Multilingual> and I<Supplementary Ideographic> planes.
=back
=head1 CAVEATS
Under C<bits=18> setting, each initialized hash function consumes ~500KB.
=head1 SEE ALSO
=over 4
=item *
L<MinHash|http://en.wikipedia.org/wiki/MinHash>
=item *
L<Text::SpeedyFx>
=back
=head1 AUTHOR
Stanislaw Pusep <stas@sysd.org>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2013 by Stanislaw Pusep.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut